Gerhard Gonter / wikidata-dump-processor / Commits

Commit 02fe41dd, authored 8 years ago by Gerhard Gonter
Parent: 85a7d892

fixed incorrect page_num calculation and thus index corruption
Showing 4 changed files with 80 additions and 22 deletions:

  .gitignore   +2  −0
  lib/PDS.pm   +40 −12
  wdq1.pl      +34 −8
  wdq2.pl      +4  −2
.gitignore  +2 −0

@@ -4,6 +4,7 @@ data/
 wkt-??/
 tmp/
 @*
+*.swp
 *.tys
 *.items
 items.csv

@@ -17,6 +18,7 @@ P234.csv
 P496.csv
 P625.csv
 P*.csv
+P*.tsv
 Q*
 PDS_backing.pages
 latest
lib/PDS.pm  +40 −12
@@ -39,6 +39,8 @@ my %defaults=
   page_hits => [], # number of times a page was loaded!
 );
 
+my $DEBUG= 0;
+
 sub new
 {
   my $class= shift;

@@ -68,9 +70,26 @@ sub new
   print "opened paging backing file [$self->{backing_file}] in mode [$bf_mode]\n";
   $self->{__FPDS__}= *FPDS;
 
+  $self->debug_hdr() if ($DEBUG > 0);
+
   $self;
 }
 
+sub debug_hdr
+{
+  my $self= shift;
+
+  print "--- 8< ---\n";
+  print "caller: ", join (' ', caller()), "\n";
+  printf ("paging: page_size=[0x%08lX] page_hdr_size=[0x%04X] rec_size=[0x%04X] recs_per_page=[0x%08lX] backing_file=[%s]\n", map { $self->{$_} } qw(page_size page_hdr_size rec_size recs_per_page backing_file));
+  printf ("page_info: last_page_num=[%d] highest_page_num=[%d] last_page=[%s]\n", map { $self->{$_} } qw(last_page_num highest_page_num last_page));
+  printf ("counter: page_same=[%d] page_next=[%d] page_up=[%d] page_down=[%d]\n", map { $self->{$_} } qw(cnt_page_same cnt_page_next cnt_page_up cnt_page_down));
+  print "--- >8 ---\n";
+}
+
 sub set
 {
   my $self= shift;
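The new debug_hdr dumps the pager's geometry and page-movement counters; it only fires when the file-scoped $DEBUG is raised (a lexical my variable, so it has to be edited in the module itself). Given the printf formats above, its output would look roughly like this, with invented values (recs_per_page here assumes a 64 KiB page with a 32-byte header and 32-byte records):

    --- 8< ---
    caller: PDS lib/PDS.pm 85
    paging: page_size=[0x00010000] page_hdr_size=[0x0020] rec_size=[0x0020] recs_per_page=[0x000007FF] backing_file=[PDS_backing.pages]
    page_info: last_page_num=[3] highest_page_num=[17] last_page=[HASH(0x55d3a8f2c610)]
    counter: page_same=[120] page_next=[14] page_up=[3] page_down=[2]
    --- >8 ---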
@@ -103,7 +122,8 @@ sub retrieve
 # print "pdsp: rec_num=[$rec_num] page_num=[$pdsp->{page_num}] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n";
   my $d= substr ($pdsp->{buffer}, $rel_rec_pos, $self->{rec_size});
-# print "d:\n"; main::hexdump ($d);
+print "d:\n"; main::hexdump ($d);
+#print "buffer:\n"; main::hexdump ($pdsp->{buffer});
   $d;
 }
@@ -113,13 +133,16 @@ sub get_page_by_rec_num
   my $self= shift;
   my $rec_num= shift;
 
-  my ($rec_size, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size last_page_num $last_page);
+  print "get_page_by_rec_num: rec_num=[$rec_num]\n" if ($DEBUG > 2);
+  my ($rec_size, $recs_per_page, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size recs_per_page last_page_num last_page);
 
-  my $page_num= int ($rec_num * $rec_size / $self->{page_size});
-  my $rel_rec_num= $rec_num % $self->{recs_per_page};
+# my $page_num= int ($rec_num * $rec_size / $self->{page_size});
+  my $page_num= int ($rec_num / $recs_per_page);
+  my $rel_rec_num= $rec_num % $recs_per_page;
   my $rel_rec_pos= $self->{page_hdr_size} + $rel_rec_num * $rec_size;
+  print "get_page_by_rec_num: page_num=[$page_num] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n" if ($DEBUG > 2);
 
 # print __LINE__, " rec_num=[$rec_num] page_num=[$page_num]\n";
   if ($page_num == $last_page_num)
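This hunk is the fix named in the commit message. Every page begins with a header of page_hdr_size bytes (see the rel_rec_pos formula above), so fewer records fit on a page than page_size / rec_size suggests. The old expression int ($rec_num * $rec_size / $self->{page_size}) divides a raw byte offset that ignores those headers, so once the skipped header bytes add up to a whole record it starts mapping records to the wrong page; that mismatch is the index corruption. A standalone sketch, with invented geometry (the module's real values may differ) and recs_per_page derived in the way the rel_rec_pos formula implies:

    #!/usr/bin/perl
    # old vs. new page_num calculation; all geometry values are hypothetical
    use strict;
    use warnings;

    my $page_size    = 4096;  # bytes per page
    my $page_hdr_size= 32;    # header at the start of each page
    my $rec_size     = 32;    # fixed record size
    my $recs_per_page= int (($page_size - $page_hdr_size) / $rec_size);  # 127

    foreach my $rec_num (126, 127, 254)
    {
      my $old= int ($rec_num * $rec_size / $page_size);  # buggy: ignores page headers
      my $new= int ($rec_num / $recs_per_page);          # fixed
      printf ("rec_num=[%3d] old page_num=[%d] new page_num=[%d]%s\n",
              $rec_num, $old, $new, ($old == $new) ? '' : '  <== differs');
    }

With 127 records per page, record 127 is the first record of page 1, but the byte-offset division still yields page 0 (127 * 32 = 4064 < 4096); from that record on, reads and writes land on the wrong page.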
@@ -189,8 +212,9 @@ sub print_page_info
 {
   my $self= shift;
 
-  print "page_size=[$self->{page_size}]\n";
-  print "recs_per_page=[$self->{recs_per_page}]\n";
+  printf ("page_size=[0x%08lX]\n", $self->{page_size});
+  printf ("rec_size=[0x%08lx]\n", $self->{rec_size});
+  printf ("recs_per_page=[0x%08lx]\n", $self->{recs_per_page});
   $self->print_page_stats();
   print "highest_page_num=[$self->{highest_page_num}]\n";
@@ -203,7 +227,8 @@ sub load_page
   my $self= shift;
   my $page_num= shift;
 
-# print "loading page_num=[$page_num]\n";
+# print '='x72, "\nloading page_num=[$page_num]\n";
+# if (0 && $page_num >= 200) { print "EXIT at page 200!\n"; exit; }
 
   my $new_page=
   {

@@ -221,8 +246,9 @@ sub load_page
   local *FPDS= $self->{'__FPDS__'};
   my $page_size= $self->{page_size};
+# $self->debug_hdr();
 
   my $rc= seek (FPDS, $page_pos, 0);
-# print "seek: rc=[$rc]\n";
+# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page_pos, $rc);
   my $new_buffer;
   my $bc= sysread (FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
@@ -271,7 +297,7 @@ sub flush_page
   my ($page, $page_num)= map { $self->{$_} } qw(last_page last_page_num);
 
-# print "flushing page_num=[$page_num]\n";
+  print '='x72, "\nflushing page_num=[$page_num]\n" if ($DEBUG > 1);
 
   return undef unless ($page_num >= 0 && defined ($page));
 
 # print "TODO: writing data page_num=[$page_num]\n";

@@ -284,8 +310,9 @@ sub flush_page
   my @d= @{$page->{dirty}};
   my $b= $page->{buffer};
-# my $cnt_dirty= @d;
-# print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n";
+  my $cnt_dirty= @d;
+  print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n" if ($DEBUG > 1);
+# $self->debug_hdr();
 
   my $new_buffer= $self->setup_header ($page_num, 0x12345678);
 # print "new_buffer length=[",length($new_buffer), "]\n";

@@ -325,8 +352,9 @@ sub flush_page
   }
 
   local *FPDS= $self->{'__FPDS__'};
+# $self->debug_hdr();
 
   my $rc= seek (FPDS, $page->{page_pos}, 0);
-# print "seek: rc=[$rc]\n";
+# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page->{page_pos}, $rc);
   my $bc= syswrite (FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
 {
wdq1.pl  +34 −8
@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated
 # not used my $LR_max_propid= 1930; # dump from 20150608
 my $seq= 'a';
-my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
+my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults...
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $upd_paths= 0;
@@ -173,12 +173,16 @@ my %filters=
   'P361' => wdpf ('P361', 'part of', 1),
   'P1269' => wdpf ('P1269', 'facet of', 1),
 
-  # person identifiers
+  # item identifer (persons, places, etc.)
+  'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity
   'P227' => wdpf ('P227', 'GND identifier'),
+  'P244' => wdpf ('P244', 'LCAuth ID'), # Library of Congress ID for authority control (for books use P1144)
+  'P1245' => wdpf ('P1245', 'OmegaWiki Defined Meaning'), # "Defined Meaning" on the site OmegaWiki
+
+  # person identifiers
   'P214' => wdpf ('P214', 'VIAF identifier'),
   'P496' => wdpf ('P496', 'ORCID identifier'),
-  'P213' => wdpf ('P213', 'ISNI'), # check
+  'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database
 
   # personal data?
   'P569' => wdpf ('P569', 'Date of birth'),
@@ -186,17 +190,19 @@ my %filters=
   'P2298' => wdpf ('P2298', 'NSDAP membership number (1925-1945)'),
 
   # publications
-  'P345' => wdpf ('P345', 'IMDb identifier'),
   'P212' => wdpf ('P212', 'ISBN-13'),
   'P236' => wdpf ('P212', 'ISSN'),
-  'P356' => wdpf ('P356', 'DOI'),
+  'P345' => wdpf ('P345', 'IMDb identifier'),
+  'P698' => wdpf ('P698', 'PubMed ID'), # identifier for journal articles/abstracts in PubMed
   'P957' => wdpf ('P957', 'ISBN-10'),
+  'P3035' => wdpf ('P3035', 'ISBN publisher prefix'), # ISBN publisher prefix
 
   # arXiv.org
   'P818' => wdpf ('P818', 'arXiv ID'),
   'P820' => wdpf ('P820', 'arXiv classification'),
 
   # permanent identifiers
+  'P356' => wdpf ('P356', 'DOI'),
   'P1184' => wdpf ('P1184', 'Handle'),
   'P727' => wdpf ('P727', 'Europeana ID'),
   'P1036' => wdpf ('P1036', 'Dewey Decimal Classification'),
@@ -217,8 +223,10 @@ my %filters=
   'P436' => wdpf ('P436', 'MusicBrainz release group id'),
   'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
 
-  # misc.
+  # Geography
   'P625' => wdpf ('P625', 'Geo Coordinates'),
+  '1566' => wdpf ('P1566', 'GeoNames ID'),
+  'P964' => wdpf ('P964', 'Austrian municipality key'), # identifier for municipalities in Austria
 
   # chemistry
   'P233' => wdpf ('P233', 'SMILES'), # Simplified Molecular Input Line Entry Specification
@@ -240,6 +248,24 @@ my %filters=
   'P1072' => wdpf ('P1072' => 'readable file format'),
   'P1073' => wdpf ('P1073' => 'writable file format'),
   'P1195' => wdpf ('P1195' => 'file extension'),
 
+  # external-id
+  'P503' => wdpf ('P503' => 'ISO standard'), # number of the ISO standard which normalizes the object
+
+  # URLs
+  'P854' => wdpf ('P854' => 'reference URL'),
+  'P856' => wdpf ('P856' => 'official website'),
+  'P953' => wdpf ('P953' => 'full text available at'),
+  'P973' => wdpf ('P973' => 'described at URL'),
+  'P1019' => wdpf ('P1019' => 'feed URL'),
+  'P1065' => wdpf ('P1065' => 'archive URL'),
+  'P1324' => wdpf ('P1324' => 'source code repository'),
+  'P1325' => wdpf ('P1325' => 'external data available at'),
+  'P1401' => wdpf ('P1401' => 'bug tracking system'),
+  'P1581' => wdpf ('P1581' => 'official blog'),
+  'P2699' => wdpf ('P2699' => 'URL'),
+
+  # '' => wdpf ('' => ''),
 );
 
 my @filters= sort keys %filters;
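Each %filters entry maps a Wikidata property ID to whatever wdpf (property, label, optional flag) builds; wdpf itself is defined elsewhere in wdq1.pl and is not part of this diff. A minimal sketch of how such a keyed table can drive per-property dispatch, using a stand-in wdpf that is an assumption, not the script's real constructor:

    #!/usr/bin/perl
    use strict;
    use warnings;

    # stand-in for wdq1.pl's wdpf(); the real one is not shown in this diff
    sub wdpf
    {
      my ($prop, $label, $flag)= @_;
      return { prop => $prop, label => $label, flag => $flag };
    }

    my %filters=
    (
      'P356' => wdpf ('P356', 'DOI'),
      'P361' => wdpf ('P361', 'part of', 1),
    );

    # claims scanned from a dump would be looked up by property ID
    foreach my $property ('P356', 'P999')
    {
      if (my $f= $filters{$property}) { print "filter matched: $property => $f->{label}\n"; }
      else                            { print "no filter for $property\n"; }
    }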
wdq2.pl  +4 −2
@@ -20,7 +20,7 @@ use Wiktionary::Utils;
 use PDS;
 
 my $seq= 'a';
-my $date= '2016-07-04';
+my $date= '2016-08-22';
 my $lang= undef;
 
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
@@ -127,7 +127,7 @@ sub scan_items
 # print "index: ", Dumper ($index);
   my ($idx_id, $idx_fo_num, $idx_pos_beg, $idx_pos_end)= map { $index->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
-  print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
+# print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
 
   my $columns= $csv->{'columns'};
 # print "columns: ", Dumper ($columns);
@@ -280,6 +280,7 @@ sub get_items
   my $cnt_items= 0;
   foreach my $rec_num (sort { $a <=> $b } @rec_nums)
   {
+    print "rec_num=[$rec_num]\n";
     my $data= $pds->retrieve ($rec_num);
 # main::hexdump ($data);
     my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
@@ -292,6 +293,7 @@ sub get_items
       fo_pos_beg => $beg,
       fo_pos_end => $end,
     };
+    print "row: ", Dumper ($row);
 
     if ($x_rec_num > 0)
     {
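The trace lines added here bracket get_items' fixed-layout index records: unpack ('LLLLLLLL', $data) yields eight unsigned 32-bit values, the first five being the record's own number, an index position, the found-object count, and the begin/end offsets; the remainder lands in @x. A round-trip sketch with invented values (the writer side of the real record format is not part of this diff, so the layout beyond those five fields is an assumption):

    #!/usr/bin/perl
    use strict;
    use warnings;

    # pack a hypothetical index record: 8 unsigned 32-bit integers = 32 bytes
    my $data= pack ('LLLLLLLL', 4711, 12, 3, 1_000_000, 1_002_345, 0, 0, 0);
    printf ("record is %d bytes\n", length ($data));

    # read it back the way get_items does
    my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
    print "x_rec_num=[$x_rec_num] pos_idx=[$pos_idx] f_num=[$f_num] beg=[$beg] end=[$end]\n";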