Skip to content
Snippets Groups Projects
Commit d411e2ef authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

added code to handle wiktionary files

parent 87cedb62
No related branches found
No related tags found
No related merge requests found
dumps/
out/
data/
wkt-??/
tmp/
@*
*.tys
......
use strict;
package WikiData::Utils;
use strict;
# TODO: make reasonable defaults and a command line option
sub get_paths
{
......
......@@ -3,15 +3,17 @@
use strict;
use JSON;
use Compress::Zlib;
use Data::Dumper;
$Data::Dumper::Indent= 1;
use FileHandle;
use lib 'lib';
use WikiData::Utils;
use WikiData::Property::Filter;
use FDS;
my $TSV_SEP= "\t";
# my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
......@@ -113,10 +115,12 @@ my %props;
my @item_attrs= qw(labels descriptions aliases claims sitelinks);
# local *FI= wkutils::open_input($fnm);
if ($fnm =~ /\.gz$/)
{
open (FI, '-|', "gunzip -c '$fnm'") or die "can't gunzip [$fnm]";
}
# elsif bunzip ... see wkt1
else
{
open (FI, '<:utf8', $fnm) or die "can't read [$fnm]";
......@@ -125,8 +129,16 @@ else
my $line= 0;
my $t_start= time();
mkdir ($data_dir) unless (-d $data_dir);
mkdir ($out_dir) unless (-d $out_dir);
# Create the data and output directories on first run, reporting each one
# as it is made.  mkdir failure (permissions, missing parent, ...) is fatal
# here; otherwise the later open() calls would fail with a less obvious
# message pointing at a file inside the missing directory.
unless (-d $data_dir)
{
  print "mkdir $data_dir\n";
  mkdir ($data_dir) or die "can't mkdir [$data_dir]: $!";
}

unless (-d $out_dir)
{
  print "mkdir $out_dir\n";
  mkdir ($out_dir) or die "can't mkdir [$out_dir]: $!";
}
# item list
my $fnm_items= $data_dir . '/items.csv';
......@@ -135,6 +147,7 @@ local *FO_ITEMS;
open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
my @cols1= qw(line pos fo_count fo_pos_beg fo_pos_end id type cnt_label cnt_desc cnt_aliases cnt_claims cnt_sitelink lang label);
print FO_ITEMS join ($TSV_SEP, @cols1, qw(filtered_props claims)), "\n";
autoflush FO_ITEMS 1;
# properties
my @cols_filt= (@cols1, 'val');
......@@ -229,55 +242,6 @@ my %filters=
);
my @filters= sort keys %filters;
# BEGIN output transcription
local *FO_RECODED;
my $fo_open= 0;
my $fo_count= 0;
my $fo_pos= 0;
# Close the currently open output chunk, if any; safe to call repeatedly.
# Uses the file-level state $fo_open and the bareword handle FO_RECODED.
sub close_fo
{
  return unless ($fo_open);   # nothing open, nothing to do

  # print FO_RECODED "]\n";   # (disabled) would terminate a JSON array wrapper
  close (FO_RECODED);
  $fo_open= 0;
}
# Open the next output chunk file (closing the previous one first) and reset
# the chunk-local bookkeeping.  The layer/filename depend on the file-level
# $fo_compress setting:
#   1 -> pipe through external gzip, suffix .gz
#   2 -> raw byte stream, records compressed individually, suffix .cmp
#   * -> plain UTF-8 text, no suffix
# Updates the file-level state $fo_count, $fo_open and $fo_pos.
sub open_fo
{
  close_fo();

  my $chunk= ++$fo_count;   # chunks are numbered from 1
  my $fo_fnm;
  if ($fo_compress == 1)
  {
    $fo_fnm= sprintf ("%s/wdq%05d.gz", $out_dir, $chunk);
    open (FO_RECODED, '|-', "gzip -c >'$fo_fnm'") or die "can't write to [$fo_fnm]";
  }
  elsif ($fo_compress == 2)
  {
    $fo_fnm= sprintf ("%s/wdq%05d.cmp", $out_dir, $chunk);
    open (FO_RECODED, '>:raw', $fo_fnm) or die "can't write to [$fo_fnm]";
  }
  else
  {
    $fo_fnm= sprintf ("%s/wdq%05d", $out_dir, $chunk);
    open (FO_RECODED, '>:utf8', $fo_fnm) or die "can't write to [$fo_fnm]";
  }
  $fo_open= 1;

  print "writing dumps to $fo_fnm\n";
  # print FO_RECODED "[\n";   # (disabled) would start a JSON array wrapper
  $fo_pos= tell (FO_RECODED); # NOTE(review): tell() on the gzip pipe yields -1 — same as original
}
# END output transcription
open_fo();
# Property Bitmap Table
my @id_prop= (); # bitmap table
my $max_id= -1;
......@@ -290,6 +254,10 @@ if ($exp_bitmap)
open (BM_FILE, '>:raw', $BM_file) or die "can't write to [$BM_file]\n";
}
my $fo_rec= new FDS('out_pattern' => "$out_dir/wdq%05d");
my $fo_count= $fo_rec->open();
my $fo_pos= 0;
<FI>;
my $pos;
LINE: while (1)
......@@ -300,9 +268,10 @@ LINE: while (1)
if ($fo_pos >= $OUT_CHUNK_SIZE)
{
open_fo();
$fo_count= $fo_rec->open();
$fo_pos= 0;
}
$fo_pos= tell(FO_RECODED);
$fo_pos= $fo_rec->tell();
$line++;
print join (' ', $line, $pos, $fo_count, $fo_pos), "\n" if (($line % 10_000) == 0);
......@@ -364,16 +333,8 @@ LINE: while (1)
}
# my $py= substr($l, 0, 30) . '...' . substr ($l, -30);
my $px;
if ($fo_compress == 2)
{
$px= print FO_RECODED compress($l);
}
else
{
$px= print FO_RECODED $l, "\n";
}
my $fo_pos_end= tell (FO_RECODED);
my $px= $fo_rec->print($l);
my $fo_pos_end= $fo_rec->tell();
# print "px=[$px] l=[$py]\n";
foreach my $a (keys %$j) { $attrs{$a}++; }
......@@ -492,7 +453,7 @@ LINE: while (1)
}
close (FI);
close_fo();
$fo_rec->close();
# check if there are multiple definitions of the same property and flatten the structure a bit
open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die;
......
......@@ -16,11 +16,14 @@ use Data::Dumper;
$Data::Dumper::Indent= 1;
use WikiData::Utils;
use Wiktionary::Utils;
use PDS;
my $seq= 'a';
my $date= '2016-07-04';
my $lang= undef;
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
# my $op_mode= 'find_items';
my $op_mode= 'get_items';
......@@ -40,6 +43,7 @@ while (my $arg= shift (@ARGV))
if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'seq') { $seq= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'lang') { $lang= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'scan') { $op_mode= 'scan'; }
else
{
......@@ -57,11 +61,25 @@ while (my $arg= shift (@ARGV))
}
# prepare items list
($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq) if ($upd_paths);
# Resolve the input/output paths once the command line options are known.
# A defined --lang switches to Wiktionary mode: different path helper and a
# different pattern for the compressed chunk files (wkt instead of wdq).
my $fnm_items;
if ($upd_paths)
{
  if (defined ($lang))
  { # language given -> Wiktionary dump
    ($fnm, $data_dir, $out_dir)= Wiktionary::Utils::get_paths ($lang, $date, $seq);
    print "ATTN: wiktionary mode!\n";
    $cmp_fnm_pattern= '%s/wkt%05d.cmp';
  }
  else
  { # no language -> plain Wikidata layout
    ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
  }
  $fnm_items= join ('/', $data_dir, 'items.csv');
}
# print __LINE__, " date=[$date] seq=[$seq] data_dir=[$data_dir]\n";
# TODO: fails if there is no data at the given date/seq
my $fnm_items= join ('/', $data_dir, 'items.csv');
my $csv= new Util::Simple_CSV (separator => "\t");
......@@ -156,7 +174,11 @@ sub parse_idx_file
my $rec_num;
if ($id =~ m#^Q(\d+)$#)
{
{ # Wikidata
$rec_num= $1;
}
elsif ($id =~ m#^(\d+)$#)
{ # Wiktionary
$rec_num= $1;
}
else
......@@ -248,6 +270,10 @@ sub get_items
{
push (@rec_nums, $1);
}
elsif ($item =~ m#^(\d+)$#)
{
push (@rec_nums, $1);
}
}
# print __LINE__, " recs: ", join (' ', @rec_nums), "\n";
......@@ -298,7 +324,7 @@ sub load_item
my ($id, $f_num, $beg, $end)= map { $row->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
my $size= $end-$beg;
my $fnm_data= sprintf ('%s/wdq%05d.cmp', $out_dir, $row->{'fo_count'});
my $fnm_data= sprintf ($cmp_fnm_pattern, $out_dir, $row->{'fo_count'});
print "id=[$id] f_num=[$f_num] fnm_data=[$fnm_data] beg=[$beg] end=[$end] size=[$size]\n";
......@@ -306,11 +332,23 @@ sub load_item
seek (FD, $beg, 0);
my $buffer;
sysread (FD, $buffer, $size);
my $json= uncompress ($buffer);
# print "json: ", Dumper ($json);
my $data= JSON::decode_json ($json);
print "data: ", Dumper ($data);
my $block= uncompress ($buffer);
# print "block: ", Dumper ($block);
$data;
if (defined ($lang))
{
# print "buffer: ", Dumper ($buffer);
# print "block: ", Dumper (\$block);
print '='x72, "\n", "block:\n", $block, "\n", '='x72, "\n";
return $block;
}
else
{
my $json= JSON::decode_json ($block);
print "json: ", Dumper ($json);
return $json;
}
}
......@@ -6,6 +6,7 @@ use JSON;
use FileHandle;
use Util::JSON;
use Util::Simple_CSV;
use Data::Dumper;
$Data::Dumper::Indent= 1;
......@@ -19,8 +20,8 @@ use FDS;
my $TSV_SEP= "\t";
# my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
# my $MAX_INPUT_LINES= undef;
# not used! my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
my $MAX_INPUT_LINES= undef;
# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
my $lang= 'de';
my $seq= 'a';
......@@ -242,11 +243,21 @@ LINE: while (1)
# statistics
$ns{$frame{ns}}->{use_count}++;
last if (defined ($MAX_INPUT_LINES) && $line > $MAX_INPUT_LINES);
}
}
my $fnm_ns= join ('/', $data_dir, 'namespaces.json');
print "saving namespaces to [$fnm_ns]\n";
Util::JSON::write_json_file ($fnm_ns, \%ns);
my $fnm_ns_json= join ('/', $data_dir, 'namespaces.json');
my $fnm_ns_csv= join ('/', $data_dir, 'namespaces.csv');
print "saving namespaces to [$fnm_ns_json]\n";
Util::JSON::write_json_file ($fnm_ns_json, \%ns);
my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
$csv->define_columns (qw(ns_id use_count ns_case ns_name));
$csv->{data}= \@ns;
$csv->save_csv_file(filename => $fnm_ns_csv);
1;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment