Skip to content
Snippets Groups Projects
Commit 988163ac authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

updated notes

parent d411e2ef
No related branches found
No related tags found
No related merge requests found
......@@ -6,10 +6,11 @@ h2. quick usage
<pre>
script
./wdq1.pl --date 2015-08-31
./wdq1.pl --date 2015-08-16
./wdq2.pl --date 2015-08-16 --scan
</pre>
The script will run for several hours (2015-08-31 took 2.5 hours on my
The scripts will run for several hours (2016-08-15 took 4.5 hours on my
machine), so it might be useful to record log messages into a transcript
file.
......@@ -27,10 +28,28 @@ element by element to a series of output files.
| props.csv | property catalog |
| P####.csv | filtered property #### |
h3. out/wdq#####.cmp
h2. wkt1.pl
TODO: describe ...
h2. TODO: gnd1.pl
TODO: write and describe ...
h2. wdq2.pl
Creates an index for items.csv to be able to load individual frames
from the item store and render them to STDOUT.
TODO:
* factor out at least the rendering step into a library for other scripts
to use.
h3. data/out/wdq#####.cmp
Each item as a JSON structure is compressed individually and written to
a file with this name pattern. The positional information in the items and P-catalogs is intended for subsequent processing steps.
a file with this name pattern. The positional information in the items
and P-catalogs is intended for subsequent processing steps (see wdq2.pl).
h3. CSV files
......
......@@ -3,7 +3,10 @@ package WikiData::Utils;
use strict;
# TODO: make reasonable defaults and a command line option
# TODO:
# * make reasonable defaults and a command line option
# * Wiktionary::Utils is more or less the same thing with other defaults; *unify* these modules!
sub get_paths
{
my $date= shift;
......
......@@ -18,13 +18,13 @@ my $TSV_SEP= "\t";
# my $OUT_CHUNK_SIZE= 500_000_000; # size of files containing item data in JSON format
my $OUT_CHUNK_SIZE= 640_000_000; # size of files containing item data in JSON format
my $MAX_INPUT_LINES= undef;
# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time
# my $MAX_INPUT_LINES= 100_000; # for debugging to limit processing time; TODO: add command line option
my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated arrays
# not used my $LR_max_propid= 1930; # dump from 20150608
my $seq= 'a';
my $date= '2015-12-28'; # maybe a config file is in order to set up the defaults...
my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $upd_paths= 0;
......@@ -46,6 +46,7 @@ while (my $arg= shift (@ARGV))
if ($an eq 'date') { $date= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'seq') { $seq= $av || shift (@ARGV); $upd_paths= 1; }
elsif ($an eq 'max-lines') { $MAX_INPUT_LINES= $av || shift (@ARGV); }
else
{
usage();
......@@ -90,11 +91,11 @@ start_time: $ts_start
-----------
EOX
analyze_dump ($fnm);
analyze_wikidata_dump ($fnm);
exit(0);
sub analyze_dump
sub analyze_wikidata_dump
{
my $fnm= shift;
......
......@@ -139,6 +139,7 @@ my %ns;
# Parser working state used while scanning the dump.
my @lines;        # NOTE(review): not referenced in the visible hunks -- confirm it is still needed
my %frame;        # fields collected for the current item, e.g. 'rev_id' / 'rev_sha1'
my @text;         # chunks of the current <text> element; reset when a new <text> tag opens
# Number of unexpected <text ...> tags seen; reported as "Attention-Count" at the end.
# Must be a scalar: it is used as $cnt_ATTN++ and interpolated as $cnt_ATTN, so
# declaring a hash (%cnt_ATTN) would leave the scalar undeclared under "use strict".
my $cnt_ATTN= 0;
LINE: while (1)
{
$pos= tell(FI);
......@@ -162,7 +163,7 @@ LINE: while (1)
print ">> [$state] [$l]\n" if ($debug > 1);
if ($state == 0)
{
if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]+)</namespace>#)
if ($l =~ m#^\s*<namespace key="([\-\d]+)" case="([^"]+)">([^"]*)</namespace>#)
{
my $ns= { ns_id => $1, ns_name => $3, ns_case => $2 };
$ns{$ns->{ns_id}}= $ns;
......@@ -185,6 +186,7 @@ LINE: while (1)
}
elsif ($l =~ m#^\s*<revision>#)
{
# print ">>> REVISION\n";
$state= 2;
}
elsif ($l =~ m#^\s*<(title|ns|id)>([^<]+)</.+>#)
......@@ -202,9 +204,15 @@ LINE: while (1)
elsif ($l =~ m#^\s*<text xml:space="preserve">(.*)#) # TODO: check for other <text> tags
{
my $t= $1;
# print ">>> TEXT\n";
$state= ($t =~ s#</text>##) ? 2 : 3;
@text= ( $t );
}
elsif ($l =~ m#^\s*<text(.*)>#) # TODO: check for other <text> tags
{
print "ATTN: strange text-tag: [$_]\n";
$cnt_ATTN++;
}
elsif ($l =~ m#^\s*<(id|sha1)>([^<]+)</.+>#)
{
$frame{'rev_'. $1}= $2;
......@@ -253,11 +261,14 @@ LINE: while (1)
print "saving namespaces to [$fnm_ns_json]\n";
Util::JSON::write_json_file ($fnm_ns_json, \%ns);
# BUG: somehow $ns{'0'} ends up as $ns{''}; the counter seems to be right ...
my @ns= map { $ns{$_} } sort { $a <=> $b } keys %ns;
my $csv= new Util::Simple_CSV ('separator' => "\t", 'no_array' => 1);
$csv->define_columns (qw(ns_id use_count ns_case ns_name));
$csv->{data}= \@ns;
$csv->save_csv_file(filename => $fnm_ns_csv);
print "Attention-Count: $cnt_ATTN\n";
1;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment