From 5c7315f8470f34456f88e5b1ba09cbf7d5d55aa7 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Sun, 25 Aug 2019 13:56:23 +0200
Subject: [PATCH] sort items list in order to speed up indexing

---
Review note: sort_items.pl was revised after the original commit (accepts
the "data/" prefixed directory that wdq0.pl passes, takes the end timestamp
after the sort has run, checks the exit status); the blob id on its index
line still refers to the earlier content.

 sort_items.pl | 32 ++++++++++++++++++++++++++++++++
 wdq0.pl       |  9 +++++++--
 wdq1.pl       |  2 +-
 3 files changed, 40 insertions(+), 3 deletions(-)
 create mode 100755 sort_items.pl

diff --git a/sort_items.pl b/sort_items.pl
new file mode 100755
index 0000000..7735000
--- /dev/null
+++ b/sort_items.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+# Sort data/<date>/items_unsorted.csv numerically on column 6 (header line
+# stays first) and write the result to data/<date>/items.csv.
+
+use strict;
+use warnings;
+
+my $arg= shift(@ARGV);
+
+# accept both "2019-08-25a" and "data/2019-08-25a" (the form wdq0.pl passes)
+die "usage: $0 [data/]YYYY-MM-DDx\n"
+  unless (defined ($arg) && $arg =~ m#^(?:data/)?(\d{4}-\d{2}-\d{2}[a-z])$#);
+my $dir= $1;
+
+my $in= "data/$dir/items_unsorted.csv";
+my $out= "data/$dir/items.csv";
+
+# \t is expanded by perl, so sort gets a literal TAB as field separator;
+# -k6.2n starts the key at the 2nd character of field 6, compared numerically
+my $cmd= "( head -n1 $in ; ( tail -n +2 $in | sort '-t\t' -k6.2n ) ) > $out";
+
+print "cmd=[$cmd]\n";
+
+my $start= scalar localtime();
+my $rc= system ($cmd);
+# take the end timestamp only after the sort has actually run
+my $end= scalar localtime();
+
+print "start: $start\n";
+print "end: $end\n";
+
+die "sort failed: rc=$rc\n" if ($rc != 0);
diff --git a/wdq0.pl b/wdq0.pl
index e7538da..fdb77b6 100755
--- a/wdq0.pl
+++ b/wdq0.pl
@@ -140,6 +140,12 @@ sub fetch_and_convert
   print "cmd1: [", join (' ', @cmd1), "]\n";
   system (@cmd1);
 
+  my $dir= sprintf("data/%sa", $date);
+
+  my @cmd1b= ('./sort_items.pl', $dir);
+  print "cmd1b: [", join (' ', @cmd1b), "]\n";
+  system (@cmd1b);
+
   notify ('wdq0: finished wdq1, starting wdq2');
   my @cmd2= (qw(./wdq2.pl --scan --date), $date);
   print "cmd2: [", join (' ', @cmd2), "]\n";
@@ -151,9 +157,8 @@ sub fetch_and_convert
   system (@cmd3);
 
   notify ('wdq0: finished wdq3, starting geonames');
-  my $dir= sprintf("data/%da", $date);
   my @cmd4= ('./geonames.pl', $dir);
-  print "cmd3: [", join (' ', @cmd4), "]\n";
+  print "cmd4: [", join (' ', @cmd4), "]\n";
   system (@cmd4);
 
   # TODO: add symlink
diff --git a/wdq1.pl b/wdq1.pl
index 4794d88..5bf19d2 100755
--- a/wdq1.pl
+++ b/wdq1.pl
@@ -151,7 +151,7 @@ sub analyze_wikidata_dump
   my $t_start= time();
 
   # item list
-  my $fnm_items= $data_dir . '/items.csv';
+  my $fnm_items= $data_dir . '/items_unsorted.csv';
   local *FO_ITEMS;
   open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
 
-- 
GitLab