Gerhard Gonter / wikidata-dump-processor / Commits

Commit d411e2ef
authored Aug 16, 2016 by Gerhard Gonter
    added code to handle wiktionary files

parent 87cedb62
Showing 5 changed files with 92 additions and 81 deletions:

  .gitignore             +1  −0
  lib/WikiData/Utils.pm  +2  −2
  wdq1.pl                +26 −65
  wdq2.pl                +47 −9
  wkt1.pl                +16 −5
.gitignore  +1 −0

 dumps/
 out/
 data/
+wkt-??/
 tmp/
 @*
 *.tys
 ...
lib/WikiData/Utils.pm  +2 −2

+use strict;
+
 package WikiData::Utils;
-use strict;
-
 # TODO: make reasonable defaults and a command line option
 sub get_paths
 {
 ...
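The body of get_paths is collapsed in this view. Judging from the call sites in wdq1.pl and wdq2.pl below, it maps a dump date and a sequence letter to a dump filename plus matching data and output directories. A speculative sketch of its shape, with the directory layout guessed from the .gitignore entries above (dumps/, data/, out/); all names here are hypothetical:

use strict;

# Speculative sketch only; the real body is collapsed above.
# Signature inferred from wdq2.pl:
#   my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
sub get_paths
{
  my ($date, $seq)= @_;

  my $fnm=      sprintf ('dumps/wikidata-%s-all.json.gz', $date);  # guessed layout
  my $data_dir= sprintf ('data/wdq-%s%s', $date, $seq);            # guessed layout
  my $out_dir=  sprintf ('out/wdq-%s%s', $date, $seq);             # guessed layout

  ($fnm, $data_dir, $out_dir);
}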
wdq1.pl  +26 −65

@@ -3,15 +3,17 @@
 use strict;

 use JSON;
 use Compress::Zlib;
 use Data::Dumper;
 $Data::Dumper::Indent= 1;
 use FileHandle;

 use lib 'lib';
 use WikiData::Utils;
 use WikiData::Property::Filter;
+use FDS;

 my $TSV_SEP= "\t";
 # my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
 my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
 ...
@@ -113,10 +115,12 @@ my %props;
 my @item_attrs= qw(labels descriptions aliases claims sitelinks);

 # local *FI= wkutils::open_input($fnm);
 if ($fnm =~ /\.gz$/)
 {
   open (FI, '-|', "gunzip -c '$fnm'") or die "can't gunzip [$fnm]";
 }
+# elsif bunzip ... see wkt1
 else
 {
   open (FI, '<:utf8', $fnm) or die "can't read [$fnm]";
 ...
@@ -125,8 +129,16 @@ else
 my $line= 0;
 my $t_start= time();

-mkdir ($data_dir) unless (-d $data_dir);
-mkdir ($out_dir) unless (-d $out_dir);
+unless (-d $data_dir)
+{
+  print "mkdir $data_dir\n";
+  mkdir ($data_dir);
+}
+unless (-d $out_dir)
+{
+  print "mkdir $out_dir\n";
+  mkdir ($out_dir)
+}

 # item list
 my $fnm_items= $data_dir . '/items.csv';
 ...
@@ -135,6 +147,7 @@ local *FO_ITEMS;
 open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
 my @cols1= qw(line pos fo_count fo_pos_beg fo_pos_end id type cnt_label cnt_desc cnt_aliases cnt_claims cnt_sitelink lang label);
 print FO_ITEMS join ($TSV_SEP, @cols1, qw(filtered_props claims)), "\n";
+autoflush FO_ITEMS 1;

 # properties
 my @cols_filt= (@cols1, 'val');
 ...
@@ -229,55 +242,6 @@ my %filters=
 );
 my @filters= sort keys %filters;

-# BEGIN output transcription
-local *FO_RECODED;
-my $fo_open= 0;
-my $fo_count= 0;
-my $fo_pos= 0;
-
-sub close_fo
-{
-  if ($fo_open)
-  {
-    # print FO_RECODED "]\n";
-    close (FO_RECODED);
-    $fo_open= 0;
-  }
-}
-
-sub open_fo
-{
-  close_fo();
-
-  my $fo_fnm;
-  if ($fo_compress == 1)
-  {
-    $fo_fnm= sprintf ("%s/wdq%05d.gz", $out_dir, ++$fo_count);
-    open (FO_RECODED, '|-', "gzip -c >'$fo_fnm'") or die "can't write to [$fo_fnm]";
-  }
-  elsif ($fo_compress == 2)
-  {
-    $fo_fnm= sprintf ("%s/wdq%05d.cmp", $out_dir, ++$fo_count);
-    open (FO_RECODED, '>:raw', $fo_fnm) or die "can't write to [$fo_fnm]";
-  }
-  else
-  {
-    $fo_fnm= sprintf ("%s/wdq%05d", $out_dir, ++$fo_count);
-    open (FO_RECODED, '>:utf8', $fo_fnm) or die "can't write to [$fo_fnm]";
-  }
-
-  $fo_open= 1;
-  print "writing dumps to $fo_fnm\n";
-  # print FO_RECODED "[\n";
-  $fo_pos= tell (FO_RECODED);
-}
-# END output transcription
-open_fo();

 # Property Bitmap Table
 my @id_prop= (); # bitmap table
 my $max_id= -1;
 ...
@@ -290,6 +254,10 @@ if ($exp_bitmap)
   open (BM_FILE, '>:raw', $BM_file) or die "can't write to [$BM_file]\n";
 }

+my $fo_rec= new FDS ('out_pattern' => "$out_dir/wdq%05d");
+my $fo_count= $fo_rec->open();
+my $fo_pos= 0;
+
 <FI>;
 my $pos;
 LINE: while (1)
 ...
@@ -300,9 +268,10 @@ LINE: while (1)
   if ($fo_pos >= $OUT_CHUNK_SIZE)
   {
-    open_fo();
+    $fo_count= $fo_rec->open();
+    $fo_pos= 0;
   }
-  $fo_pos= tell (FO_RECODED);
+  $fo_pos= $fo_rec->tell();

   $line++;
   print join (' ', $line, $pos, $fo_count, $fo_pos), "\n" if (($line % 10_000) == 0);
 ...
@@ -364,16 +333,8 @@ LINE: while (1)
   }

   # my $py= substr($l, 0, 30) . '...' . substr ($l, -30);
-  my $px;
-  if ($fo_compress == 2)
-  {
-    $px= print FO_RECODED compress ($l);
-  }
-  else
-  {
-    $px= print FO_RECODED $l, "\n";
-  }
-  my $fo_pos_end= tell (FO_RECODED);
+  my $px= $fo_rec->print ($l);
+  my $fo_pos_end= $fo_rec->tell();
   # print "px=[$px] l=[$py]\n";

   foreach my $a (keys %$j) { $attrs{$a}++; }
 ...
@@ -492,7 +453,7 @@ LINE: while (1)
 }
 close (FI);
-close_fo();
+$fo_rec->close();

 # check if there are multiple definitions of the same property and flatten the structure a bit
 open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die;
 ...
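The main refactoring in wdq1.pl: the hand-rolled FO_RECODED bookkeeping (open_fo/close_fo plus the $fo_compress branches) moves into an FDS object. The FDS module itself is not shown in this commit, so the following is only a minimal stand-in consistent with the calls above, i.e. new FDS ('out_pattern' => ...), open() returning the new chunk number, tell(), print() and close(); the real module presumably also covers the gzip and .cmp output modes that the deleted block handled:

package FDS;  # minimal stand-in; the real FDS module is not part of this commit
use strict;

sub new
{
  my ($class, %par)= @_;
  bless { out_pattern => $par{out_pattern}, count => 0, fh => undef }, $class;
}

sub open  # finish any open chunk, start the next one, return its number
{
  my $self= shift;
  $self->close() if (defined ($self->{fh}));
  my $fnm= sprintf ($self->{out_pattern}, ++$self->{count});
  CORE::open (my $fh, '>:utf8', $fnm) or die "can't write to [$fnm]";
  $self->{fh}= $fh;
  CORE::print "writing dumps to $fnm\n";
  $self->{count};
}

sub tell  { CORE::tell (shift->{fh}) }
sub print { my $self= shift; CORE::print {$self->{fh}} @_, "\n" }  # plain-text mode only
sub close { my $self= shift; CORE::close ($self->{fh}); $self->{fh}= undef }

1;

Packing the rotation logic into one object is what lets the main loop shrink to a single $fo_count= $fo_rec->open() whenever $fo_pos crosses $OUT_CHUNK_SIZE.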
wdq2.pl  +47 −9

@@ -16,11 +16,14 @@ use Data::Dumper;
 $Data::Dumper::Indent= 1;

 use WikiData::Utils;
+use Wiktionary::Utils;
 use PDS;

 my $seq= 'a';
 my $date= '2016-07-04';
+my $lang= undef;
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
+my $cmp_fnm_pattern= '%s/wdq%05d.cmp';

 # my $op_mode= 'find_items';
 my $op_mode= 'get_items';
 ...
@@ -40,6 +43,7 @@ while (my $arg= shift (@ARGV))
       if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
       elsif ($an eq 'seq') { $seq= $av || shift (@ARGV); $upd_paths= 1; }
+      elsif ($an eq 'lang') { $lang= $av || shift (@ARGV); $upd_paths= 1; }
       elsif ($an eq 'scan') { $op_mode= 'scan'; }
       else
 ...
@@ -57,11 +61,25 @@ while (my $arg= shift (@ARGV))
 }

 # prepare items list
-($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq) if ($upd_paths);
+my $fnm_items;
+if ($upd_paths)
+{
+  if (defined ($lang))
+  { # must be Wiktionary, if there is a language defined ...
+    ($fnm, $data_dir, $out_dir)= Wiktionary::Utils::get_paths ($lang, $date, $seq);
+    print "ATTN: wiktionary mode!\n";
+    $fnm_items= join ('/', $data_dir, "items.csv");
+    $cmp_fnm_pattern= '%s/wkt%05d.cmp';
+  }
+  else
+  {
+    ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
+    $fnm_items= join ('/', $data_dir, 'items.csv');
+  }
+}
 # print __LINE__, " date=[$date] seq=[$seq] data_dir=[$data_dir]\n";
 # TODO: fails if there is no data at the given date/seq
-my $fnm_items= join ('/', $data_dir, 'items.csv');

 my $csv= new Util::Simple_CSV (separator => "\t");
 ...
@@ -156,7 +174,11 @@ sub parse_idx_file
   my $rec_num;
   if ($id =~ m#^Q(\d+)$#)
-  {
+  { # Wikidata
     $rec_num= $1;
   }
+  elsif ($id =~ m#^(\d+)$#)
+  { # Wiktionary
+    $rec_num= $1;
+  }
   else
 ...
@@ -248,6 +270,10 @@ sub get_items
     {
       push (@rec_nums, $1);
     }
+    elsif ($item =~ m#^(\d+)$#)
+    {
+      push (@rec_nums, $1);
+    }
   }
   # print __LINE__, " recs: ", join (' ', @rec_nums), "\n";
 ...
@@ -298,7 +324,7 @@ sub load_item
   my ($id, $f_num, $beg, $end)= map { $row->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
   my $size= $end - $beg;

-  my $fnm_data= sprintf ('%s/wdq%05d.cmp', $out_dir, $row->{'fo_count'});
+  my $fnm_data= sprintf ($cmp_fnm_pattern, $out_dir, $row->{'fo_count'});
   print "id=[$id] f_num=[$f_num] fnm_data=[$fnm_data] beg=[$beg] end=[$end] size=[$size]\n";
 ...
@@ -306,11 +332,23 @@ sub load_item
   seek (FD, $beg, 0);
   my $buffer;
   sysread (FD, $buffer, $size);
-  my $json= uncompress ($buffer);
-  # print "json: ", Dumper ($json);
-  my $data= JSON::decode_json ($json);
-  print "data: ", Dumper ($data);
+  my $block= uncompress ($buffer);
+  # print "block: ", Dumper ($block);

-  $data;
+  if (defined ($lang))
+  {
+    # print "buffer: ", Dumper ($buffer);
+    # print "block: ", Dumper (\$block);
+    print '=' x72, "\n", "block:\n", $block, "\n", '=' x72, "\n";
+    return $block;
+  }
+  else
+  {
+    my $json= JSON::decode_json ($block);
+    print "json: ", Dumper ($json);
+    return $json;
+  }
 }
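The reworked load_item shows the index scheme end to end: items.csv stores, per item, a chunk number (fo_count) and byte offsets (fo_pos_beg/fo_pos_end), so one seek and one sysread followed by Compress::Zlib's uncompress() retrieve a single record without scanning the whole dump; $cmp_fnm_pattern merely switches the chunk naming between wdq%05d.cmp and wkt%05d.cmp. A self-contained sketch of that lookup, with hypothetical chunk name and offsets:

use strict;
use Compress::Zlib;
use JSON;

# Fetch one record from a chunk file, given the byte range recorded in
# items.csv (fo_pos_beg/fo_pos_end).  Mirrors load_item above, except
# that it pairs sysseek with sysread instead of mixing in buffered seek.
sub fetch_block
{
  my ($fnm_data, $beg, $end)= @_;

  open (my $fd, '<:raw', $fnm_data) or die "can't read [$fnm_data]";
  sysseek ($fd, $beg, 0);
  my $buffer;
  sysread ($fd, $buffer, $end - $beg);
  close ($fd);

  uncompress ($buffer);  # each record was compress()ed individually by wdq1.pl
}

# hypothetical chunk name and offsets; real values come from the items.csv index
my $block= fetch_block ('out/wdq00001.cmp', 0, 1024);
my $json= JSON::decode_json ($block);  # Wikidata records are JSON;
                                       # Wiktionary (wkt) chunks hold raw wikitext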
wkt1.pl  +16 −5

@@ -6,6 +6,7 @@ use JSON;
 use FileHandle;
 use Util::JSON;
+use Util::Simple_CSV;
 use Data::Dumper;
 $Data::Dumper::Indent= 1;
 ...
@@ -19,8 +20,8 @@ use FDS;
 my $TSV_SEP= "\t";
 # my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
 my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format

-# my $MAX_INPUT_LINES= undef;
-# not used! my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
+my $MAX_INPUT_LINES= undef;
+# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time

 my $lang= 'de';
 my $seq= 'a';
 ...
@@ -242,11 +243,21 @@ LINE: while (1)
     # statistics
     $ns{$frame{ns}}->{use_count}++;

     last if (defined ($MAX_INPUT_LINES) && $line > $MAX_INPUT_LINES);
   }
 }

-my $fnm_ns= join ('/', $data_dir, 'namespaces.json');
-print "saving namespaces to [$fnm_ns]\n";
-Util::JSON::write_json_file ($fnm_ns, \%ns);
+my $fnm_ns_json= join ('/', $data_dir, 'namespaces.json');
+my $fnm_ns_csv= join ('/', $data_dir, 'namespaces.csv');
+print "saving namespaces to [$fnm_ns_json]\n";
+Util::JSON::write_json_file ($fnm_ns_json, \%ns);
+
+my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
+my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
+$csv->define_columns (qw(ns_id use_count ns_case ns_name));
+$csv->{data}= \@ns;
+$csv->save_csv_file (filename => $fnm_ns_csv);

 1;
 }
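The new CSV export assumes that %ns maps each numeric namespace id to a hashref carrying exactly the four columns passed to define_columns. A sketch of that implied structure; the ids, case values and names are hypothetical examples loosely modeled on a German Wiktionary dump ($lang is 'de' above):

# Hypothetical example of the %ns structure implied by the code above:
# keyed by numeric namespace id, one hashref per namespace.
my %ns=
(
   0 => { ns_id =>  0, use_count => 0, ns_case => 'case-sensitive', ns_name => '' },
  10 => { ns_id => 10, use_count => 0, ns_case => 'first-letter',   ns_name => 'Vorlage' },
  14 => { ns_id => 14, use_count => 0, ns_case => 'first-letter',   ns_name => 'Kategorie' },
);

# every page seen in namespace $n bumps that namespace's counter
$ns{10}->{use_count}++;

# numeric sort by id gives the row order written to namespaces.csv
my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;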