From 5c7315f8470f34456f88e5b1ba09cbf7d5d55aa7 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Sun, 25 Aug 2019 13:56:23 +0200
Subject: [PATCH] sort items list in order to speed up indexing

---
 sort_items.pl | 20 ++++++++++++++++++++
 wdq0.pl       |  9 +++++++--
 wdq1.pl       |  2 +-
 3 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100755 sort_items.pl

diff --git a/sort_items.pl b/sort_items.pl
new file mode 100755
index 0000000..7735000
--- /dev/null
+++ b/sort_items.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+# Sort data/<dir>/items_unsorted.csv into data/<dir>/items.csv, keeping the header line first.
+use strict;
+
+my $dir= shift(@ARGV);
+$dir =~ s#^data/##; # wdq0.pl passes "data/<date>a"; the data/ prefix is added again below
+die "usage: $0 YYYY-MM-DD[a-z]\n" unless ($dir =~ m#^\d{4}-\d{2}-\d{2}[a-z]$#);
+
+# rows are ordered numerically on tab-separated field 6, starting at its 2nd character
+my $cmd= "( head -n1 data/$dir/items_unsorted.csv ; ( tail -n +2 data/$dir/items_unsorted.csv | sort '-t\t' -k6.2n ) ) > data/$dir/items.csv";
+print "cmd=[$cmd]\n";
+
+my $start= scalar localtime();
+my $rc= system ($cmd);
+my $end=   scalar localtime(); # taken after the sort so start/end bracket the run
+
+print "start: $start\n";
+print "end:   $end\n";
+warn "sort command failed: status=$rc\n" if ($rc != 0);
+
diff --git a/wdq0.pl b/wdq0.pl
index e7538da..fdb77b6 100755
--- a/wdq0.pl
+++ b/wdq0.pl
@@ -140,6 +140,12 @@ sub fetch_and_convert
     print "cmd1: [", join (' ', @cmd1), "]\n";
     system (@cmd1);
 
+    my $dir= sprintf("data/%sa", $date);
+
+    my @cmd1b= ('./sort_items.pl', $dir);
+    print "cmd1b: [", join (' ', @cmd1b), "]\n";
+    system (@cmd1b);
+
     notify ('wdq0: finished wdq1, starting wdq2');
     my @cmd2= (qw(./wdq2.pl --scan --date), $date);
     print "cmd2: [", join (' ', @cmd2), "]\n";
@@ -151,9 +157,8 @@ sub fetch_and_convert
     system (@cmd3);
 
     notify ('wdq0: finished wdq3, starting geonames');
-    my $dir= sprintf("data/%da", $date);
     my @cmd4= ('./geonames.pl', $dir);
-    print "cmd3: [", join (' ', @cmd4), "]\n";
+    print "cmd4: [", join (' ', @cmd4), "]\n";
     system (@cmd4);
 
     # TODO: add symlink
diff --git a/wdq1.pl b/wdq1.pl
index 4794d88..5bf19d2 100755
--- a/wdq1.pl
+++ b/wdq1.pl
@@ -151,7 +151,7 @@ sub analyze_wikidata_dump
   my $t_start= time();
 
   # item list
-  my $fnm_items= $data_dir . '/items.csv';
+  my $fnm_items= $data_dir . '/items_unsorted.csv';
 
   local *FO_ITEMS;
   open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
-- 
GitLab