Skip to content
Snippets Groups Projects
Commit d411e2ef authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

added code to handle wiktionary files

parent 87cedb62
No related branches found
No related tags found
No related merge requests found
dumps/
out/
data/
wkt-??/
tmp/
@*
*.tys
......
use strict;
package WikiData::Utils;
use strict;
# TODO: make reasonable defaults and a command line option
sub get_paths
{
......
......@@ -3,15 +3,17 @@
use strict;
use JSON;
use Compress::Zlib;
use Data::Dumper;
$Data::Dumper::Indent= 1;
use FileHandle;
use lib 'lib';
use WikiData::Utils;
use WikiData::Property::Filter;
use FDS;
my $TSV_SEP= "\t";
# my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
......@@ -113,10 +115,12 @@ my %props;
my @item_attrs= qw(labels descriptions aliases claims sitelinks);
# local *FI= wkutils::open_input($fnm);
if ($fnm =~ /\.gz$/)
{
open (FI, '-|', "gunzip -c '$fnm'") or die "can't gunzip [$fnm]";
}
# elsif bunzip ... see wkt1
else
{
open (FI, '<:utf8', $fnm) or die "can't read [$fnm]";
......@@ -125,8 +129,16 @@ else
my $line= 0;
my $t_start= time();
mkdir ($data_dir) unless (-d $data_dir);
mkdir ($out_dir) unless (-d $out_dir);
# Create the data and output directories on first run, reporting each one
# as it is made.  mkdir failure (permissions, missing parent, ...) is fatal
# here; otherwise the later open() calls would fail with a less obvious
# message pointing at a file inside the missing directory.
unless (-d $data_dir)
{
  print "mkdir $data_dir\n";
  mkdir ($data_dir) or die "can't mkdir [$data_dir]: $!";
}

unless (-d $out_dir)
{
  print "mkdir $out_dir\n";
  mkdir ($out_dir) or die "can't mkdir [$out_dir]: $!";
}
# item list
my $fnm_items= $data_dir . '/items.csv';
......@@ -135,6 +147,7 @@ local *FO_ITEMS;
open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
my @cols1= qw(line pos fo_count fo_pos_beg fo_pos_end id type cnt_label cnt_desc cnt_aliases cnt_claims cnt_sitelink lang label);
print FO_ITEMS join ($TSV_SEP, @cols1, qw(filtered_props claims)), "\n";
autoflush FO_ITEMS 1;
# properties
my @cols_filt= (@cols1, 'val');
......@@ -229,55 +242,6 @@ my %filters=
);
my @filters= sort keys %filters;
# BEGIN output transcription
local *FO_RECODED;
my $fo_open= 0;
my $fo_count= 0;
my $fo_pos= 0;
# Close the currently open output chunk, if any; safe to call repeatedly.
# Uses the file-level state $fo_open and the bareword handle FO_RECODED.
sub close_fo
{
  return unless ($fo_open);   # nothing open, nothing to do

  # print FO_RECODED "]\n";   # (disabled) would terminate a JSON array wrapper
  close (FO_RECODED);
  $fo_open= 0;
}
# Open the next output chunk file (closing the previous one first) and reset
# the chunk-local bookkeeping.  The layer/filename depend on the file-level
# $fo_compress setting:
#   1 -> pipe through external gzip, suffix .gz
#   2 -> raw byte stream, records compressed individually, suffix .cmp
#   * -> plain UTF-8 text, no suffix
# Updates the file-level state $fo_count, $fo_open and $fo_pos.
sub open_fo
{
  close_fo();

  my $chunk= ++$fo_count;   # chunks are numbered from 1
  my $fo_fnm;
  if ($fo_compress == 1)
  {
    $fo_fnm= sprintf ("%s/wdq%05d.gz", $out_dir, $chunk);
    open (FO_RECODED, '|-', "gzip -c >'$fo_fnm'") or die "can't write to [$fo_fnm]";
  }
  elsif ($fo_compress == 2)
  {
    $fo_fnm= sprintf ("%s/wdq%05d.cmp", $out_dir, $chunk);
    open (FO_RECODED, '>:raw', $fo_fnm) or die "can't write to [$fo_fnm]";
  }
  else
  {
    $fo_fnm= sprintf ("%s/wdq%05d", $out_dir, $chunk);
    open (FO_RECODED, '>:utf8', $fo_fnm) or die "can't write to [$fo_fnm]";
  }
  $fo_open= 1;

  print "writing dumps to $fo_fnm\n";
  # print FO_RECODED "[\n";   # (disabled) would start a JSON array wrapper
  $fo_pos= tell (FO_RECODED); # NOTE(review): tell() on the gzip pipe yields -1 — same as original
}
# END output transcription
open_fo();
# Property Bitmap Table
my @id_prop= (); # bitmap table
my $max_id= -1;
......@@ -290,6 +254,10 @@ if ($exp_bitmap)
open (BM_FILE, '>:raw', $BM_file) or die "can't write to [$BM_file]\n";
}
my $fo_rec= new FDS('out_pattern' => "$out_dir/wdq%05d");
my $fo_count= $fo_rec->open();
my $fo_pos= 0;
<FI>;
my $pos;
LINE: while (1)
......@@ -300,9 +268,10 @@ LINE: while (1)
if ($fo_pos >= $OUT_CHUNK_SIZE)
{
open_fo();
$fo_count= $fo_rec->open();
$fo_pos= 0;
}
$fo_pos= tell(FO_RECODED);
$fo_pos= $fo_rec->tell();
$line++;
print join (' ', $line, $pos, $fo_count, $fo_pos), "\n" if (($line % 10_000) == 0);
......@@ -364,16 +333,8 @@ LINE: while (1)
}
# my $py= substr($l, 0, 30) . '...' . substr ($l, -30);
my $px;
if ($fo_compress == 2)
{
$px= print FO_RECODED compress($l);
}
else
{
$px= print FO_RECODED $l, "\n";
}
my $fo_pos_end= tell (FO_RECODED);
my $px= $fo_rec->print($l);
my $fo_pos_end= $fo_rec->tell();
# print "px=[$px] l=[$py]\n";
foreach my $a (keys %$j) { $attrs{$a}++; }
......@@ -492,7 +453,7 @@ LINE: while (1)
}
close (FI);
close_fo();
$fo_rec->close();
# check if there are multiple definitions of the same property and flatten the structure a bit
open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die;
......
......@@ -16,11 +16,14 @@ use Data::Dumper;
$Data::Dumper::Indent= 1;
use WikiData::Utils;
use Wiktionary::Utils;
use PDS;
my $seq= 'a';
my $date= '2016-07-04';
my $lang= undef;
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
# my $op_mode= 'find_items';
my $op_mode= 'get_items';
......@@ -40,6 +43,7 @@ while (my $arg= shift (@ARGV))
if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'seq') { $seq= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'lang') { $lang= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'scan') { $op_mode= 'scan'; }
else
{
......@@ -57,11 +61,25 @@ while (my $arg= shift (@ARGV))
}
# prepare items list
($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq) if ($upd_paths);
# Resolve the input/output paths once the command line options are known.
# A defined --lang switches to Wiktionary mode: different path helper and a
# different pattern for the compressed chunk files (wkt instead of wdq).
my $fnm_items;
if ($upd_paths)
{
  if (defined ($lang))
  { # language given -> Wiktionary dump
    ($fnm, $data_dir, $out_dir)= Wiktionary::Utils::get_paths ($lang, $date, $seq);
    print "ATTN: wiktionary mode!\n";
    $cmp_fnm_pattern= '%s/wkt%05d.cmp';
  }
  else
  { # no language -> plain Wikidata layout
    ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
  }
  $fnm_items= join ('/', $data_dir, 'items.csv');
}
# print __LINE__, " date=[$date] seq=[$seq] data_dir=[$data_dir]\n";
# TODO: fails if there is no data at the given date/seq
my $fnm_items= join ('/', $data_dir, 'items.csv');
my $csv= new Util::Simple_CSV (separator => "\t");
......@@ -156,7 +174,11 @@ sub parse_idx_file
my $rec_num;
if ($id =~ m#^Q(\d+)$#)
{
{ # Wikidata
$rec_num= $1;
}
elsif ($id =~ m#^(\d+)$#)
{ # Wiktionary
$rec_num= $1;
}
else
......@@ -248,6 +270,10 @@ sub get_items
{
push (@rec_nums, $1);
}
elsif ($item =~ m#^(\d+)$#)
{
push (@rec_nums, $1);
}
}
# print __LINE__, " recs: ", join (' ', @rec_nums), "\n";
......@@ -298,7 +324,7 @@ sub load_item
my ($id, $f_num, $beg, $end)= map { $row->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
my $size= $end-$beg;
my $fnm_data= sprintf ('%s/wdq%05d.cmp', $out_dir, $row->{'fo_count'});
my $fnm_data= sprintf ($cmp_fnm_pattern, $out_dir, $row->{'fo_count'});
print "id=[$id] f_num=[$f_num] fnm_data=[$fnm_data] beg=[$beg] end=[$end] size=[$size]\n";
......@@ -306,11 +332,23 @@ sub load_item
seek (FD, $beg, 0);
my $buffer;
sysread (FD, $buffer, $size);
my $json= uncompress ($buffer);
# print "json: ", Dumper ($json);
my $data= JSON::decode_json ($json);
print "data: ", Dumper ($data);
my $block= uncompress ($buffer);
# print "block: ", Dumper ($block);
$data;
if (defined ($lang))
{
# print "buffer: ", Dumper ($buffer);
# print "block: ", Dumper (\$block);
print '='x72, "\n", "block:\n", $block, "\n", '='x72, "\n";
return $block;
}
else
{
my $json= JSON::decode_json ($block);
print "json: ", Dumper ($json);
return $json;
}
}
......@@ -6,6 +6,7 @@ use JSON;
use FileHandle;
use Util::JSON;
use Util::Simple_CSV;
use Data::Dumper;
$Data::Dumper::Indent= 1;
......@@ -19,8 +20,8 @@ use FDS;
my $TSV_SEP= "\t";
# my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
# my $MAX_INPUT_LINES= undef;
# not used! my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
my $MAX_INPUT_LINES= undef;
# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
my $lang= 'de';
my $seq= 'a';
......@@ -242,11 +243,21 @@ LINE: while (1)
# statistics
$ns{$frame{ns}}->{use_count}++;
last if (defined ($MAX_INPUT_LINES) && $line > $MAX_INPUT_LINES);
}
}
my $fnm_ns= join ('/', $data_dir, 'namespaces.json');
print "saving namespaces to [$fnm_ns]\n";
Util::JSON::write_json_file ($fnm_ns, \%ns);
my $fnm_ns_json= join ('/', $data_dir, 'namespaces.json');
my $fnm_ns_csv= join ('/', $data_dir, 'namespaces.csv');
print "saving namespaces to [$fnm_ns_json]\n";
Util::JSON::write_json_file ($fnm_ns_json, \%ns);
my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
$csv->define_columns (qw(ns_id use_count ns_case ns_name));
$csv->{data}= \@ns;
$csv->save_csv_file(filename => $fnm_ns_csv);
1;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment