Commit 02fe41dd authored by Gerhard Gonter

fixed incorrect page_num calculation and thus index corruption

parent 85a7d892
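The fix itself is the page_num calculation in get_page_by_rec_num in the paging module (second diff below): the old formula int ($rec_num * $rec_size / $page_size) ignores the per-page header, while the new one divides by the actual number of records per page. A minimal standalone sketch of the difference, assuming recs_per_page is derived as int ((page_size - page_hdr_size) / rec_size) and using made-up sizes (4096-byte pages, 32-byte header, 32-byte records) rather than the module's real configuration:

#!/usr/bin/perl
# sketch only: illustrates the page_num fix with made-up sizes
use strict;
use warnings;

my $page_size     = 4096;  # bytes per page in the backing file (example value)
my $page_hdr_size =   32;  # bytes reserved for the page header (example value)
my $rec_size      =   32;  # bytes per record (example value)

# assumed derivation: each page holds a header plus whole records
my $recs_per_page = int (($page_size - $page_hdr_size) / $rec_size);  # 127 here

foreach my $rec_num (120 .. 135)
{
  # old formula: ignores the page header, effectively assuming 128 records per page
  my $old_page_num = int ($rec_num * $rec_size / $page_size);

  # new formula: uses the real number of records per page
  my $new_page_num = int ($rec_num / $recs_per_page);

  printf ("rec_num=[%3d] old page_num=[%d] new page_num=[%d]%s\n",
          $rec_num, $old_page_num, $new_page_num,
          ($old_page_num == $new_page_num) ? '' : ' <-- wrong page');
}

With these example sizes the old formula looks up record 127 on page 0 although it was written to page 1, and the mismatched range widens as rec_num grows, which matches the index corruption described in the commit message.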
@@ -4,6 +4,7 @@ data/
 wkt-??/
 tmp/
 @*
+*.swp
 *.tys
 *.items
 items.csv
@@ -17,6 +18,7 @@ P234.csv
 P496.csv
 P625.csv
 P*.csv
+P*.tsv
 Q*
 PDS_backing.pages
 latest
...
@@ -39,6 +39,8 @@ my %defaults=
   page_hits => [], # number of times a page was loaded!
 );
 
+my $DEBUG= 0;
+
 sub new
 {
   my $class= shift;
@@ -68,9 +70,26 @@ sub new
   print "opened paging backing file [$self->{backing_file}] in mode [$bf_mode]\n";
   $self->{__FPDS__}= *FPDS;
 
+  $self->debug_hdr() if ($DEBUG > 0);
+
   $self;
 }
 
+sub debug_hdr
+{
+  my $self= shift;
+
+  print "--- 8< ---\n";
+  print "caller: ", join (' ', caller()), "\n";
+  printf ("paging: page_size=[0x%08lX] page_hdr_size=[0x%04X] rec_size=[0x%04X] recs_per_page=[0x%08lX] backing_file=[%s]\n",
+    map { $self->{$_} } qw(page_size page_hdr_size rec_size recs_per_page backing_file));
+  printf ("page_info: last_page_num=[%d] highest_page_num=[%d] last_page=[%s]\n",
+    map { $self->{$_} } qw(last_page_num highest_page_num last_page));
+  printf ("counter: page_same=[%d] page_next=[%d] page_up=[%d] page_down=[%d]\n",
+    map { $self->{$_} } qw(cnt_page_same cnt_page_next cnt_page_up cnt_page_down));
+  print "--- >8 ---\n";
+}
+
 sub set
 {
   my $self= shift;
@@ -103,7 +122,8 @@ sub retrieve
   # print "pdsp: rec_num=[$rec_num] page_num=[$pdsp->{page_num}] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n";
 
   my $d= substr ($pdsp->{buffer}, $rel_rec_pos, $self->{rec_size});
-  # print "d:\n"; main::hexdump ($d);
+  print "d:\n"; main::hexdump ($d);
+  #print "buffer:\n"; main::hexdump ($pdsp->{buffer});
   $d;
 }
@@ -113,13 +133,16 @@ sub get_page_by_rec_num
   my $self= shift;
   my $rec_num= shift;
 
-  my ($rec_size, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size last_page_num $last_page);
+  print "get_page_by_rec_num: rec_num=[$rec_num]\n" if ($DEBUG > 2);
+  my ($rec_size, $recs_per_page, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size recs_per_page last_page_num last_page);
 
-  my $page_num= int ($rec_num * $rec_size / $self->{page_size});
-  my $rel_rec_num= $rec_num % $self->{recs_per_page};
+  # my $page_num= int ($rec_num * $rec_size / $self->{page_size});
+  my $page_num= int ($rec_num / $recs_per_page);
+  my $rel_rec_num= $rec_num % $recs_per_page;
   my $rel_rec_pos= $self->{page_hdr_size} + $rel_rec_num * $rec_size;
+  print "get_page_by_rec_num: page_num=[$page_num] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n" if ($DEBUG > 2);
 
   # print __LINE__, " rec_num=[$rec_num] page_num=[$page_num]\n";
   if ($page_num == $last_page_num)
@@ -189,8 +212,9 @@ sub print_page_info
 {
   my $self= shift;
 
-  print "page_size=[$self->{page_size}]\n";
-  print "recs_per_page=[$self->{recs_per_page}]\n";
+  printf ("page_size=[0x%08lX]\n", $self->{page_size});
+  printf ("rec_size=[0x%08lx]\n", $self->{rec_size});
+  printf ("recs_per_page=[0x%08lx]\n", $self->{recs_per_page});
   $self->print_page_stats();
 
   print "highest_page_num=[$self->{highest_page_num}]\n";
@@ -203,7 +227,8 @@ sub load_page
   my $self= shift;
   my $page_num= shift;
 
-  # print "loading page_num=[$page_num]\n";
+  # print '='x72, "\nloading page_num=[$page_num]\n";
+  # if (0 && $page_num >= 200) { print "EXIT at page 200!\n"; exit; }
 
   my $new_page=
   {
@@ -221,8 +246,9 @@ sub load_page
   local *FPDS= $self->{'__FPDS__'};
   my $page_size= $self->{page_size};
 
+  # $self->debug_hdr();
   my $rc= seek(FPDS, $page_pos, 0);
-  # print "seek: rc=[$rc]\n";
+  # printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page_pos, $rc);
 
   my $new_buffer;
   my $bc= sysread(FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
@@ -271,7 +297,7 @@ sub flush_page
   my ($page, $page_num)= map { $self->{$_} } qw(last_page last_page_num);
 
-  # print "flushing page_num=[$page_num]\n";
+  print '='x72, "\nflushing page_num=[$page_num]\n" if ($DEBUG > 1);
 
   return undef unless ($page_num >= 0 && defined ($page));
   # print "TODO: writing data page_num=[$page_num]\n";
@@ -284,8 +310,9 @@ sub flush_page
   my @d= @{$page->{dirty}};
   my $b= $page->{buffer};
 
-  # my $cnt_dirty= @d;
-  # print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n";
+  my $cnt_dirty= @d;
+  print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n" if ($DEBUG > 1);
+  # $self->debug_hdr();
 
   my $new_buffer= $self->setup_header($page_num, 0x12345678);
   # print "new_buffer length=[",length($new_buffer), "]\n";
@@ -325,8 +352,9 @@ sub flush_page
   }
 
   local *FPDS= $self->{'__FPDS__'};
+  # $self->debug_hdr();
   my $rc= seek(FPDS, $page->{page_pos}, 0);
-  # print "seek: rc=[$rc]\n";
+  # printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page->{page_pos}, $rc);
 
   my $bc= syswrite(FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
   {
...
@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated
 # not used my $LR_max_propid= 1930; # dump from 20150608
 
 my $seq= 'a';
-my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
+my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults...
 
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $upd_paths= 0;
@@ -173,12 +173,16 @@ my %filters=
   'P361' => wdpf ('P361', 'part of', 1),
   'P1269' => wdpf ('P1269', 'facet of', 1),
 
-  # person identifiers
-  'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity
+  # item identifer (persons, places, etc.)
   'P227' => wdpf ('P227', 'GND identifier'),
+  'P244' => wdpf ('P244', 'LCAuth ID'), # Library of Congress ID for authority control (for books use P1144)
+  'P1245' => wdpf ('P1245', 'OmegaWiki Defined Meaning'), # "Defined Meaning" on the site OmegaWiki
+
+  # person identifiers
   'P214' => wdpf ('P214', 'VIAF identifier'),
   'P496' => wdpf ('P496', 'ORCID identifier'),
+  'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database
+  'P213' => wdpf ('P213', 'ISNI'), # check
 
   # personal data?
   'P569' => wdpf ('P569', 'Date of birth'),
@@ -186,17 +190,19 @@ my %filters=
   'P2298' => wdpf ('P2298', 'NSDAP membership number (1925-1945)'),
 
   # publications
-  'P345' => wdpf ('P345', 'IMDb identifier'),
   'P212' => wdpf ('P212', 'ISBN-13'),
   'P236' => wdpf ('P212', 'ISSN'),
+  'P345' => wdpf ('P345', 'IMDb identifier'),
+  'P356' => wdpf ('P356', 'DOI'),
+  'P698' => wdpf ('P698', 'PubMed ID'), # identifier for journal articles/abstracts in PubMed
   'P957' => wdpf ('P957', 'ISBN-10'),
+  'P3035' => wdpf ('P3035', 'ISBN publisher prefix'), # ISBN publisher prefix
 
   # arXiv.org
   'P818' => wdpf ('P818', 'arXiv ID'),
   'P820' => wdpf ('P820', 'arXiv classification'),
 
   # permanent identifiers
-  'P356' => wdpf ('P356', 'DOI'),
   'P1184' => wdpf ('P1184', 'Handle'),
   'P727' => wdpf ('P727', 'Europeana ID'),
   'P1036' => wdpf ('P1036', 'Dewey Decimal Classification'),
@@ -217,8 +223,10 @@ my %filters=
   'P436' => wdpf ('P436', 'MusicBrainz release group id'),
   'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
 
-  # misc.
+  # Geography
   'P625' => wdpf ('P625', 'Geo Coordinates'),
+  '1566' => wdpf ('P1566', 'GeoNames ID'),
+  'P964' => wdpf ('P964', 'Austrian municipality key'), # identifier for municipalities in Austria
 
   # chemistry
   'P233' => wdpf ('P233', 'SMILES'), # Simplified Molecular Input Line Entry Specification
@@ -240,6 +248,24 @@ my %filters=
   'P1072' => wdpf ('P1072' => 'readable file format'),
   'P1073' => wdpf ('P1073' => 'writable file format'),
   'P1195' => wdpf ('P1195' => 'file extension'),
+
+  # external-id
+  'P503' => wdpf ('P503' => 'ISO standard'), # number of the ISO standard which normalizes the object
+
+  # URLs
+  'P854' => wdpf ('P854' => 'reference URL'),
+  'P856' => wdpf ('P856' => 'official website'),
+  'P953' => wdpf ('P953' => 'full text available at'),
+  'P973' => wdpf ('P973' => 'described at URL'),
+  'P1019' => wdpf ('P1019' => 'feed URL'),
+  'P1065' => wdpf ('P1065' => 'archive URL'),
+  'P1324' => wdpf ('P1324' => 'source code repository'),
+  'P1325' => wdpf ('P1325' => 'external data available at'),
+  'P1401' => wdpf ('P1401' => 'bug tracking system'),
+  'P1581' => wdpf ('P1581' => 'official blog'),
+  'P2699' => wdpf ('P2699' => 'URL'),
+
+  # '' => wdpf ('' => ''),
 );
 
 my @filters= sort keys %filters;
...
@@ -20,7 +20,7 @@ use Wiktionary::Utils;
 use PDS;
 
 my $seq= 'a';
-my $date= '2016-07-04';
+my $date= '2016-08-22';
 my $lang= undef;
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
@@ -127,7 +127,7 @@ sub scan_items
   # print "index: ", Dumper ($index);
   my ($idx_id, $idx_fo_num, $idx_pos_beg, $idx_pos_end)= map { $index->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
-  print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
+  # print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
 
   my $columns= $csv->{'columns'};
   # print "columns: ", Dumper ($columns);
@@ -280,6 +280,7 @@ sub get_items
   my $cnt_items= 0;
   foreach my $rec_num (sort { $a <=> $b } @rec_nums)
   {
+    print "rec_num=[$rec_num]\n";
     my $data= $pds->retrieve ($rec_num);
     # main::hexdump ($data);
     my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
@@ -292,6 +293,7 @@ sub get_items
       fo_pos_beg => $beg,
       fo_pos_end => $end,
     };
+    print "row: ", Dumper ($row);
 
     if ($x_rec_num > 0)
     {
...