updated notes

988163ac · Gerhard Gonter · d411e2ef · 988163ac · 988163ac · 988163ac
Commit 988163ac authored Aug 17, 2016 by Gerhard Gonter
--- a/README.textile
+++ b/README.textile
@@ -6,10 +6,11 @@ h2. quick usage

 <pre>
 script
-./wdq1.pl --date 2015-08-31
+./wdq1.pl --date 2015-08-16
+./wdq2.pl --date 2015-08-16 --scan
 </pre>

-The script will run for several hours (2015-08-31 took 2.5 hours on my
+The scripts will run for several hours (2016-08-15 took 4.5 hours on my
 machine), so it might be useful to record log messages into a transcript
 file.

@@ -27,10 +28,28 @@ element by element to a series of output files.
 | props.csv | property catalog |
 | P####.csv | filtered property #### |

-h3. out/wdq#####.cmp
+h2. wkt1.pl
+
+TODO: describe ...
+
+h2. TODO: gnd1.pl
+
+TODO: write and describe ...
+
+h2. wdq2.pl
+
+Creates an index for items.csv to be able to load individual frames
+from the item store and render them to STDOUT.
+
+TODO:
+* factor out at least the rendering step into a library for other scripts
+  to use.
+
+h3. data/out/wdq#####.cmp

 Each item as a JSON structure is compressed individually and written to
-a file with this name pattern.  The positional information in the items and P-catalogs are intended for subsequent processing steps.
+a file with this name pattern.  The positional information in the items
+and P-catalogs is intended for subsequent processing steps (see wdq2.pl).

 h3. CSV files


--- a/lib/WikiData/Utils.pm
+++ b/lib/WikiData/Utils.pm
@@ -3,7 +3,10 @@ package WikiData::Utils;

 use strict;

-# TODO: make reasonable defaults and a command line option
+# TODO:
+# * make reasonable defaults and a command line option
+# * Wiktionary::Utils is more or less the same thing with other defaults; *unify* these modules!
+
 sub get_paths
 {
  my $date= shift;

--- a/wdq1.pl
+++ b/wdq1.pl
@@ -18,13 +18,13 @@ my $TSV_SEP= "\t";
 # my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
 my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
 my $MAX_INPUT_LINES= undef;
-# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
+# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time; can also be set with --max-lines

 my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated arrays
 # not used my $LR_max_propid= 1930; # dump from 20150608

 my $seq= 'a';
-my $date= '2015-12-28'; # maybe a config file is in order to set up the defaults...
+my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $upd_paths= 0;

@@ -46,6 +46,7 @@ while (my $arg= shift (@ARGV))

       if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
    elsif ($an eq 'seq')  { $seq=  $av || shift (@ARGV); $upd_paths= 1; }
+    elsif ($an eq 'max-lines') { $MAX_INPUT_LINES=  $av || shift (@ARGV); }
    else
    {
      usage();
@@ -90,11 +91,11 @@ start_time: $ts_start
 -----------
 EOX

-analyze_dump ($fnm);
+analyze_wikidata_dump ($fnm);

 exit(0);

-sub analyze_dump
+sub analyze_wikidata_dump
 {
  my $fnm= shift;


--- a/wkt1.pl
+++ b/wkt1.pl
@@ -139,6 +139,7 @@ my %ns;
 my @lines;
 my %frame;
 my @text;
+my $cnt_ATTN= 0; # scalar counter of unexpected <text ...> tags; incremented and printed as $cnt_ATTN
 LINE: while (1)
 {
  $pos= tell(FI);
@@ -162,7 +163,7 @@ LINE: while (1)
  print ">> [$state] [$l]\n" if ($debug > 1);
  if ($state == 0)
  {
-    if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]+)</namespace>#)
+    if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]*)</namespace>#)
    {
      my $ns= { ns_id => $1, ns_name => $3, ns_case => $2 };
      $ns{$ns->{ns_id}}= $ns;
@@ -185,6 +186,7 @@ LINE: while (1)
    }
    elsif ($l =~ m#^\s*<revision>#)
    {
+      # print ">>> REVISION\n";
      $state= 2;
    }
    elsif ($l =~ m#^\s*<(title|ns|id)>([^<]+)</.+>#)
@@ -202,9 +204,15 @@ LINE: while (1)
    elsif ($l =~ m#^\s*<text xml:space="preserve">(.*)#) # TODO: check for other <text> tags
    {
      my $t= $1;
+      # print ">>> TEXT\n";
      $state= ($t =~ s#</text>##) ? 2 : 3;
      @text= ( $t );
    }
+    elsif ($l =~ m#^\s*<text(.*)>#) # any other <text ...> variant: report the line and count it; TODO: check for other <text> tags
+    {
+      print "ATTN: strange text-tag: [$l]\n"; # the current input line is held in $l, not $_
+      $cnt_ATTN++;
+    }
    elsif ($l =~ m#^\s*<(id|sha1)>([^<]+)</.+>#)
    {
      $frame{'rev_'. $1}= $2;
@@ -253,11 +261,14 @@ LINE: while (1)
  print "saving namespaces to [$fnm_ns_json]\n";
  Util::JSON::write_json_file ($fnm_ns_json, \%ns);

+  # BUG: somehow $ns{'0'} ends up as $ns{''}; the counter seems to be right ...
  my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
  my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
  $csv->define_columns (qw(ns_id use_count ns_case ns_name));
  $csv->{data}= \@ns;
  $csv->save_csv_file(filename => $fnm_ns_csv);

+  print "Attention-Count: $cnt_ATTN\n";
+
  1;
 }