diff --git a/README.textile b/README.textile
index 5256697d7ef697f35e152038cf639aae206ee3bc..11cbff38969bec30b5f5aa665d19f02e7281daea 100644
--- a/README.textile
+++ b/README.textile
@@ -6,10 +6,11 @@ h2. quick usage
 
 <pre>
 script
-./wdq1.pl --date 2015-08-31
+./wdq1.pl --date 2015-08-16
+./wdq2.pl --date 2015-08-16 --scan
 </pre>
 
-The script will run for several hours (2015-08-31 took 2.5 hours on my
+The scripts will run for several hours (2016-08-15 took 4.5 hours on my
 machine), so it might be useful to record log messages into a
 transcript file.
 
@@ -27,10 +28,28 @@ element by element to a series of output files.
 | props.csv | property catalog |
 | P####.csv | filtered property #### |
 
-h3. out/wdq#####.cmp
+h2. wkt1.pl
+
+TODO: describe ...
+
+h2. TODO: gnd1.pl
+
+TODO: write and describe ...
+
+h2. wdq2.pl
+
+Creates an index for items.csv to be able to load individual frames
+from the item store and render them to STDOUT.
+
+TODO:
+* factor out at least the rendering step into a library for other scripts
+  to use.
+
+h3. data/out/wdq#####.cmp
 
 Each item as a JSON structure is compressed individually and written to
-a file with this name pattern. The positional information in the items and P-catalogs are intended for subsequent processing steps.
+a file with this name pattern. The positional information in the items
+and P-catalogs is intended for subsequent processing steps (see wdq2.pl).
 
 h3. CSV files
 
diff --git a/lib/WikiData/Utils.pm b/lib/WikiData/Utils.pm
index 531c9aa1ef98a3686416051e69732ef1830d0a6f..39b3f045bf35a851b504ba12810380d37c646612 100644
--- a/lib/WikiData/Utils.pm
+++ b/lib/WikiData/Utils.pm
@@ -3,7 +3,10 @@ package WikiData::Utils;
 
 use strict;
 
-# TODO: make reasonable defaults and a command line option
+# TODO:
+# * make reasonable defaults and a command line option
+# * Wiktionary::Utils is more or less the same thing with other defaults; *unify* these modules!
+
 sub get_paths
 {
   my $date= shift;
diff --git a/wdq1.pl b/wdq1.pl
index 26f779b7c39d71c897f3591f91ef76318ebfbd78..a9d0da2dc7a37b1cea064a011f419acd6b0bd7e5 100755
--- a/wdq1.pl
+++ b/wdq1.pl
@@ -18,13 +18,13 @@ my $TSV_SEP= "\t";
 # my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
 my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
 my $MAX_INPUT_LINES= undef;
-# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
+# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time; TODO: add commandline option
 
 my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated arrays # not used
 my $LR_max_propid= 1930; # dump from 20150608
 
 my $seq= 'a';
-my $date= '2015-12-28'; # maybe a config file is in order to set up the defaults...
+my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
 
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $upd_paths= 0;
@@ -46,6 +46,7 @@ while (my $arg= shift (@ARGV))
 
     if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
     elsif ($an eq 'seq') { $seq= $av || shift (@ARGV); $upd_paths= 1; }
+    elsif ($an eq 'max-lines') { $MAX_INPUT_LINES= $av || shift (@ARGV); }
    else
    {
      usage();
@@ -90,11 +91,11 @@ start_time: $ts_start
 -----------
 EOX
 
-analyze_dump ($fnm);
+analyze_wikidata_dump ($fnm);
 
 exit(0);
 
-sub analyze_dump
+sub analyze_wikidata_dump
 {
   my $fnm= shift;
 
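The wdq2.pl side of this change is only described above, not shown. As a rough illustration of the idea from the README hunk (use the recorded positional information to load one frame from the item store), here is a minimal sketch, assuming an index row carries a chunk number, byte offset and length, and assuming the per-item compression is zlib; the helper name fetch_frame and the column layout are illustrative guesses, not wdq2.pl's actual code:

<pre>
# hypothetical sketch, not part of this patch
use strict;
use warnings;
use Compress::Zlib qw(uncompress);
use JSON qw(decode_json);

sub fetch_frame
{
  my ($out_dir, $chunk, $offset, $length)= @_; # assumed index columns
  my $fnm= sprintf ('%s/wdq%05d.cmp', $out_dir, $chunk);
  open (my $fh, '<:raw', $fnm) or die "can not open [$fnm]";
  seek ($fh, $offset, 0) or die "can not seek to $offset";
  read ($fh, my $blob, $length) == $length or die "short read";
  close ($fh);
  my $json= uncompress ($blob); # assumes the items were deflated with zlib
  return decode_json ($json);   # one item as a Perl data structure
}
</pre>

Rendering to STDOUT would then amount to pretty-printing the returned structure, e.g. via JSON's pretty/canonical options.
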
diff --git a/wkt1.pl b/wkt1.pl
index 3dee436e54616a52b362d1450cca9d6557b73b4f..49f485df93284fc3af48f90b5ec99b20f85f5e13 100755
--- a/wkt1.pl
+++ b/wkt1.pl
@@ -139,6 +139,7 @@ my %ns;
 my @lines;
 my %frame;
 my @text;
+my $cnt_ATTN= 0;
 LINE: while (1)
 {
   $pos= tell(FI);
@@ -162,7 +163,7 @@ LINE: while (1)
   print ">> [$state] [$l]\n" if ($debug > 1);
   if ($state == 0)
   {
-    if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]+)</namespace>#)
+    if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]*)</namespace>#)
     {
       my $ns= { ns_id => $1, ns_name => $3, ns_case => $2 };
       $ns{$ns->{ns_id}}= $ns;
@@ -185,6 +186,7 @@ LINE: while (1)
     }
     elsif ($l =~ m#^\s*<revision>#)
     {
+      # print ">>> REVISION\n";
       $state= 2;
     }
     elsif ($l =~ m#^\s*<(title|ns|id)>([^<]+)</.+>#)
@@ -202,9 +204,15 @@ LINE: while (1)
     elsif ($l =~ m#^\s*<text xml:space="preserve">(.*)#) # TODO: check for other <text> tags
     {
       my $t= $1;
+      # print ">>> TEXT\n";
       $state= ($t =~ s#</text>##) ? 2 : 3;
       @text= ( $t );
     }
+    elsif ($l =~ m#^\s*<text(.*)>#) # TODO: check for other <text> tags
+    {
+      print "ATTN: strange text-tag: [$l]\n";
+      $cnt_ATTN++;
+    }
     elsif ($l =~ m#^\s*<(id|sha1)>([^<]+)</.+>#)
     {
       $frame{'rev_'. $1}= $2;
@@ -253,11 +261,14 @@ LINE: while (1)
 print "saving namespaces to [$fnm_ns_json]\n";
 Util::JSON::write_json_file ($fnm_ns_json, \%ns);
 
+# BUG: somehow $ns{'0'} ends up as $ns{''}; the counter seems to be right ...
 my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
 my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
 $csv->define_columns (qw(ns_id use_count ns_case ns_name));
 $csv->{data}= \@ns;
 $csv->save_csv_file(filename => $fnm_ns_csv);
 
+print "Attention-Count: $cnt_ATTN\n";
+
 1;
 }
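A note on the BUG comment in the last hunk: in MediaWiki XML dumps the main namespace (key 0) is typically serialized as a self-closing tag with an empty name, <namespace key="0" case="first-letter" />, a form that neither the old ([^"]+) nor the new ([^"]*) pattern matches, so namespace 0 is never registered by this regex; if a later step files use counts by namespace name, they would land under the empty-string key, much as the comment describes. Below is a sketch of a pattern covering both forms, as an untested assumption about the dump's serialization rather than part of the patch:

<pre>
# hypothetical sketch, not part of this patch
if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)"\s*(?:/>|>([^<]*)</namespace>)#)
{
  # $3 is undef for the self-closing form; fall back to the empty string
  my $ns= { ns_id => $1, ns_case => $2, ns_name => (defined $3 ? $3 : '') };
  $ns{$ns->{ns_id}}= $ns;
}
</pre>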