diff --git a/sort_items.pl b/sort_items.pl
new file mode 100755
--- /dev/null
+++ b/sort_items.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+
+# Sort the item list produced by wdq1.pl.
+#
+# usage: ./sort_items.pl data/YYYY-MM-DDx   (the data/ prefix is optional)
+#
+# Reads data/DIR/items_unsorted.csv and writes data/DIR/items.csv:
+# the first line is copied through unchanged, all following lines are
+# sorted by sort(1) on tab-separated field 6, numerically, starting at
+# the second character of that field (presumably to skip a one-letter
+# prefix of the item id; confirm against the columns wdq1.pl writes).
+
+use strict;
+use warnings;
+
+my $arg= shift(@ARGV);
+die "usage: $0 data/YYYY-MM-DDx\n" unless (defined ($arg));
+
+# wdq0.pl passes the directory including the data/ prefix (the same
+# value it hands to geonames.pl); the bare directory name is accepted
+# as well.  Only the validated capture is used in the shell command.
+my ($dir)= $arg =~ m#^(?:data/)?(\d{4}-\d{2}-\d{2}[a-z])/?$#;
+die "invalid data directory [$arg]\n" unless (defined ($dir));
+
+my $unsorted= "data/$dir/items_unsorted.csv";
+my $sorted= "data/$dir/items.csv";
+die "input file [$unsorted] not found\n" unless (-f $unsorted);
+
+# \t is expanded by Perl, so sort(1) receives a literal tab as the
+# field separator.
+my $cmd= "( head -n1 $unsorted ; ( tail -n +2 $unsorted | sort '-t\t' -k6.2n ) ) > $sorted";
+
+print "cmd=[$cmd]\n";
+
+my $start= scalar localtime();
+my $rc= system ($cmd);
+my $end= scalar localtime();  # taken after the command so it is the real end time
+
+print "start: $start\n";
+print "end: $end\n";
+
+die "command failed: system returned [$rc]\n" unless ($rc == 0);
diff --git a/wdq0.pl b/wdq0.pl
index e7538da6b2dccc188dbe841d949731293a822bd0..fdb77b630588aab39657d9848c278e5500ba2eb8 100755
--- a/wdq0.pl
+++ b/wdq0.pl
@@ -140,6 +140,12 @@ sub fetch_and_convert
   print "cmd1: [", join (' ', @cmd1), "]\n";
   system (@cmd1);
 
+  my $dir= sprintf("data/%sa", $date);
+
+  my @cmd1b= ('./sort_items.pl', $dir);
+  print "cmd1b: [", join (' ', @cmd1b), "]\n";
+  system (@cmd1b);
+
   notify ('wdq0: finished wdq1, starting wdq2');
   my @cmd2= (qw(./wdq2.pl --scan --date), $date);
   print "cmd2: [", join (' ', @cmd2), "]\n";
@@ -151,9 +157,8 @@ sub fetch_and_convert
   system (@cmd3);
 
   notify ('wdq0: finished wdq3, starting geonames');
-  my $dir= sprintf("data/%da", $date);
   my @cmd4= ('./geonames.pl', $dir);
-  print "cmd3: [", join (' ', @cmd4), "]\n";
+  print "cmd4: [", join (' ', @cmd4), "]\n";
   system (@cmd4);
 
   # TODO: add symlink
diff --git a/wdq1.pl b/wdq1.pl
index 4794d88561a86b6de06af322933e687d20d7bd0f..5bf19d22dc42066ea44004c8381c58c299c0553c 100755
--- a/wdq1.pl
+++ b/wdq1.pl
@@ -151,7 +151,7 @@ sub analyze_wikidata_dump
   my $t_start= time();
 
   # item list
-  my $fnm_items= $data_dir . '/items.csv';
+  my $fnm_items= $data_dir . '/items_unsorted.csv';
   local *FO_ITEMS;
   open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
 