Skip to content
Snippets Groups Projects
Commit f2e76cba authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

small uncommitted changes

parent e4f86684
No related branches found
No related tags found
No related merge requests found
......@@ -7,6 +7,7 @@ tmp/
*.swp
*.tys
*.items
*.log.gz
items.csv
props.csv
props.saved.csv
......
......@@ -124,13 +124,18 @@ h3. TODO
h2. Wiktionary
fetch dumps from [2] and [3] and possibly other wiktionaries
fetch dumps from [2], [3] and [4] and possibly other wiktionaries
{en,de,nl}wiktionary-<YYYYMMDD>-pages-meta-current.xml.bz2
e.g. https://dumps.wikimedia.org/enwiktionary/20170501/enwiktionary-20170501-pages-meta-current.xml.bz2
h2. Links
* [1] https://dumps.wikimedia.org/other/wikidata/
* [2] https://dumps.wikimedia.org/enwiktionary/
* [3] https://dumps.wikimedia.org/dewiktionary/
* [4] https://dumps.wikimedia.org/nlwiktionary/
h3. Todo: add a way to get the proper date
......
......@@ -69,7 +69,7 @@ notify('starting wdq0 loop');
while (1)
{
my $dumps= check();
my $dumps= check_dump();
# print "dumps: ", Dumper ($dumps);
foreach my $dump (@$dumps)
{
......@@ -106,7 +106,7 @@ sub fetch_and_convert
else
{
print "fetching stuff for $date\n";
notify('wdq0: this is a test send from w4.urxn.at');
notify("wdq0: about to fetch dump for $date");
my ($fetched, $dump_file)= fetch_dump ($date);
if ($fetched)
......@@ -133,7 +133,7 @@ sub fetch_and_convert
return undef;
}
notify ('wdq0: finished download, starting wdq1');
notify ("wdq0: finished download, size=$fetched, starting wdq1");
my @cmd1= (qw(./wdq1.pl --date), $date);
print "cmd1: [", join (' ', @cmd1), "]\n";
system (@cmd1);
......@@ -148,6 +148,10 @@ sub fetch_and_convert
print "cmd3: [", join (' ', @cmd3), "]\n";
system (@cmd3);
# TODO: add symlink
system (qw(rm data/latest));
system ('ln', '-s', join ('', $date, $seq), 'data/latest');
notify ('wdq0: finished wikidata conversion');
}
......@@ -162,7 +166,7 @@ sub fetch_dump
my $dump_file= $d.'.json.gz';
my $l_dump_file= 'dumps/'. $dump_file;
print "dump_file=[$dump_file] l_dump_file=[$l_dump_file]\n";
print __LINE__, " dump_file=[$dump_file] l_dump_file=[$l_dump_file]\n";
unless (-f $l_dump_file)
{
......@@ -170,7 +174,7 @@ sub fetch_dump
my @cmd_fetch= ($wget, $dump_url, '-O'.$l_dump_file);
print "cmd_fetch: [", join (' ', @cmd_fetch), "]\n";
# return undef;
# system (@cmd_fetch);
system (@cmd_fetch);
}
my @st= stat ($l_dump_file);
......@@ -183,7 +187,7 @@ sub fetch_dump
($fetched, $dump_file);
}
sub check
sub check_dump
{
my $cmd_fetch= "$wget $dumps_source -O-";
......
......@@ -32,7 +32,7 @@ my $date= '2016-12-19'; # maybe a config file should be used to set up the defau
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $upd_paths= 0;
my @langs= qw(en de it fr);
my @langs= qw(en de it fr nl);
my $fo_compress= 2;
# 0..don't compress at all
......@@ -179,6 +179,7 @@ my %filters=
'P360' => wdpf ('P360', 'is a list of', 1),
'P361' => wdpf ('P361', 'part of', 1),
'P1269' => wdpf ('P1269', 'facet of', 1),
'P2429' => wdpf ('P2429', 'label_en | expected completeness', 1), # describes whether a property is intended to represent a complete set of real-world items having that property
# item identifier (persons, places, etc.)
'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment