diff --git a/.gitignore b/.gitignore index c00fa22e192bf18637beec19d54542c37e60499c..ce89327d28e275b44941600a2a3a780e45c7447f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ tmp/ *.swp *.tys *.items +*.log.gz items.csv props.csv props.saved.csv diff --git a/README.textile b/README.textile index 11cbff38969bec30b5f5aa665d19f02e7281daea..e993d82fc76d48ad33e08b7e339d75b0c5a3ba9a 100644 --- a/README.textile +++ b/README.textile @@ -124,13 +124,18 @@ h3. TODO h2. Wikitionary -fetch dumps from [2] and [3] and possibly other wiktionaries +fetch dumps from [2], [3] and [4] and possibly other wiktionaries + +{en,de,nl}wiktionary-<YYYYMMDD>-pages-meta-current.xml.bz2 + +e.g. https://dumps.wikimedia.org/enwiktionary/20170501/enwiktionary-20170501-pages-meta-current.xml.bz2 h2. Links * [1] https://dumps.wikimedia.org/other/wikidata/ * [2] https://dumps.wikimedia.org/enwiktionary/ * [3] https://dumps.wikimedia.org/dewiktionary/ +* [4] https://dumps.wikimedia.org/nlwiktionary/ h3. Todo: add a way to get the proper date diff --git a/wdq0.pl b/wdq0.pl index b9deedee038cd368c44c15b5ba672eb4a2d2619f..af80ca138f54bce0a0ce4ca60e4d570f38f6e443 100755 --- a/wdq0.pl +++ b/wdq0.pl @@ -69,7 +69,7 @@ notify('starting wdq0 loop'); while (1) { - my $dumps= check(); + my $dumps= check_dump(); # print "dumps: ", Dumper ($dumps); foreach my $dump (@$dumps) { @@ -106,7 +106,7 @@ sub fetch_and_convert else { print "fetching stuff for $date\n"; - notify('wdq0: this is a test send from w4.urxn.at'); + notify("wdq0: about to fetch dump for $date"); my ($fetched, $dump_file)= fetch_dump ($date); if ($fetched) @@ -133,7 +133,7 @@ sub fetch_and_convert return undef; } - notify ('wdq0: finished download, starting wdq1'); + notify ("wdq0: finished download, size=$fetched, starting wdq1"); my @cmd1= (qw(./wdq1.pl --date), $date); print "cmd1: [", join (' ', @cmd1), "]\n"; system (@cmd1); @@ -148,6 +148,10 @@ sub fetch_and_convert print "cmd3: [", join (' ', @cmd3), "]\n"; system (@cmd3); + # TODO: add symlink + system (qw(rm data/latest)); + system ('ln', '-s', join ('', $date, $seq), 'data/latest'); + notify ('wdq0: finished wikidata conversion'); } @@ -162,7 +166,7 @@ sub fetch_dump my $dump_file= $d.'.json.gz'; my $l_dump_file= 'dumps/'. $dump_file; - print "dump_file=[$dump_file] l_dump_file=[$l_dump_file]\n"; + print __LINE__, " dump_file=[$dump_file] l_dump_file=[$l_dump_file]\n"; unless (-f $l_dump_file) { @@ -170,7 +174,7 @@ sub fetch_dump my @cmd_fetch= ($wget, $dump_url, '-O'.$l_dump_file); print "cmd_fetch: [", join (' ', @cmd_fetch), "]\n"; # return undef; - # system (@cmd_fetch); + system (@cmd_fetch); } my @st= stat ($l_dump_file); @@ -183,7 +187,7 @@ sub fetch_dump ($fetched, $dump_file); } -sub check +sub check_dump { my $cmd_fetch= "$wget $dumps_source -O-"; diff --git a/wdq1.pl b/wdq1.pl index 8f61b88881572f48c79e3b286977f38d94e93af6..2e47fb49c0f6260b0dc3c888dd701c828fe98dcd 100755 --- a/wdq1.pl +++ b/wdq1.pl @@ -32,7 +32,7 @@ my $date= '2016-12-19'; # maybe a config file should be used to set up the defau my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my $upd_paths= 0; -my @langs= qw(en de it fr); +my @langs= qw(en de it fr nl); my $fo_compress= 2; # 0..don't compress at all @@ -179,6 +179,7 @@ my %filters= 'P360' => wdpf ('P360', 'is a list of', 1), 'P361' => wdpf ('P361', 'part of', 1), 'P1269' => wdpf ('P1269', 'facet of', 1), + 'P2429' => wdpf ('P2429', 'label_en | expected completeness', 1), # describes whether a property is intended to represent a complete set of real-world items having that property # item identifer (persons, places, etc.) 'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity