Skip to content
Snippets Groups Projects
Commit 02fe41dd authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

fixed incorrect page_num calculation which caused index corruption

parent 85a7d892
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,7 @@ data/
wkt-??/
tmp/
@*
*.swp
*.tys
*.items
items.csv
......@@ -17,6 +18,7 @@ P234.csv
P496.csv
P625.csv
P*.csv
P*.tsv
Q*
PDS_backing.pages
latest
......
......
......@@ -39,6 +39,8 @@ my %defaults=
page_hits => [], # number of times a page was loaded!
);
my $DEBUG= 0;
sub new
{
my $class= shift;
......@@ -68,9 +70,26 @@ sub new
print "opened paging backing file [$self->{backing_file}] in mode [$bf_mode]\n";
$self->{__FPDS__}= *FPDS;
$self->debug_hdr() if ($DEBUG > 0);
$self;
}
sub debug_hdr
{
  # Dump the pager's configuration, current page state, and page-movement
  # counters to STDOUT, framed by scissor markers, for debugging.
  # Also reports the caller so the trace shows where the dump was triggered.
  my $obj= shift;

  print "--- 8< ---\n";
  print "caller: ", join (' ', caller()), "\n";

  # hash slices preserve the listed key order, same as the original map
  my @cfg= @{$obj}{qw(page_size page_hdr_size rec_size recs_per_page backing_file)};
  printf ("paging: page_size=[0x%08lX] page_hdr_size=[0x%04X] rec_size=[0x%04X] recs_per_page=[0x%08lX] backing_file=[%s]\n", @cfg);

  my @state= @{$obj}{qw(last_page_num highest_page_num last_page)};
  printf ("page_info: last_page_num=[%d] highest_page_num=[%d] last_page=[%s]\n", @state);

  my @counters= @{$obj}{qw(cnt_page_same cnt_page_next cnt_page_up cnt_page_down)};
  printf ("counter: page_same=[%d] page_next=[%d] page_up=[%d] page_down=[%d]\n", @counters);

  print "--- >8 ---\n";
}
sub set
{
my $self= shift;
......@@ -103,7 +122,8 @@ sub retrieve
# print "pdsp: rec_num=[$rec_num] page_num=[$pdsp->{page_num}] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n";
my $d= substr ($pdsp->{buffer}, $rel_rec_pos, $self->{rec_size});
# print "d:\n"; main::hexdump ($d);
print "d:\n"; main::hexdump ($d);
#print "buffer:\n"; main::hexdump ($pdsp->{buffer});
$d;
}
......@@ -113,13 +133,16 @@ sub get_page_by_rec_num
my $self= shift;
my $rec_num= shift;
my ($rec_size, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size last_page_num $last_page);
print "get_page_by_rec_num: rec_num=[$rec_num]\n" if ($DEBUG > 2);
my ($rec_size, $recs_per_page, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size recs_per_page last_page_num last_page);
my $page_num= int ($rec_num * $rec_size / $self->{page_size});
my $rel_rec_num= $rec_num % $self->{recs_per_page};
# my $page_num= int ($rec_num * $rec_size / $self->{page_size});
my $page_num= int ($rec_num / $recs_per_page);
my $rel_rec_num= $rec_num % $recs_per_page;
my $rel_rec_pos= $self->{page_hdr_size} + $rel_rec_num * $rec_size;
print "get_page_by_rec_num: page_num=[$page_num] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n" if ($DEBUG > 2);
# print __LINE__, " rec_num=[$rec_num] page_num=[$page_num]\n";
if ($page_num == $last_page_num)
......@@ -189,8 +212,9 @@ sub print_page_info
{
my $self= shift;
print "page_size=[$self->{page_size}]\n";
print "recs_per_page=[$self->{recs_per_page}]\n";
printf ("page_size=[0x%08lX]\n", $self->{page_size});
printf ("rec_size=[0x%08lx]\n", $self->{rec_size});
printf ("recs_per_page=[0x%08lx]\n", $self->{recs_per_page});
$self->print_page_stats();
print "highest_page_num=[$self->{highest_page_num}]\n";
......@@ -203,7 +227,8 @@ sub load_page
my $self= shift;
my $page_num= shift;
# print "loading page_num=[$page_num]\n";
# print '='x72, "\nloading page_num=[$page_num]\n";
# if (0 && $page_num >= 200) { print "EXIT at page 200!\n"; exit; }
my $new_page=
{
......@@ -221,8 +246,9 @@ sub load_page
local *FPDS= $self->{'__FPDS__'};
my $page_size= $self->{page_size};
# $self->debug_hdr();
my $rc= seek(FPDS, $page_pos, 0);
# print "seek: rc=[$rc]\n";
# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page_pos, $rc);
my $new_buffer;
my $bc= sysread(FPDS, $new_buffer, $page_size);
unless ($bc == $page_size)
......@@ -271,7 +297,7 @@ sub flush_page
my ($page, $page_num)= map { $self->{$_} } qw(last_page last_page_num);
# print "flushing page_num=[$page_num]\n";
print '='x72, "\nflushing page_num=[$page_num]\n" if ($DEBUG > 1);
return undef unless ($page_num >= 0 && defined ($page));
# print "TODO: writing data page_num=[$page_num]\n";
......@@ -284,8 +310,9 @@ sub flush_page
my @d= @{$page->{dirty}};
my $b= $page->{buffer};
# my $cnt_dirty= @d;
# print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n";
my $cnt_dirty= @d;
print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n" if ($DEBUG > 1);
# $self->debug_hdr();
my $new_buffer= $self->setup_header($page_num, 0x12345678);
# print "new_buffer length=[",length($new_buffer), "]\n";
......@@ -325,8 +352,9 @@ sub flush_page
}
local *FPDS= $self->{'__FPDS__'};
# $self->debug_hdr();
my $rc= seek(FPDS, $page->{page_pos}, 0);
# print "seek: rc=[$rc]\n";
# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page->{page_pos}, $rc);
my $bc= syswrite(FPDS, $new_buffer, $page_size);
unless ($bc == $page_size)
{
......
......
......@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated
# not used my $LR_max_propid= 1930; # dump from 20150608
my $seq= 'a';
my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults...
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $upd_paths= 0;
......@@ -173,12 +173,16 @@ my %filters=
'P361' => wdpf ('P361', 'part of', 1),
'P1269' => wdpf ('P1269', 'facet of', 1),
# person identifiers
# item identifer (persons, places, etc.)
'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity
'P227' => wdpf ('P227', 'GND identifier'),
'P244' => wdpf ('P244', 'LCAuth ID'), # Library of Congress ID for authority control (for books use P1144)
'P1245' => wdpf ('P1245', 'OmegaWiki Defined Meaning'), # "Defined Meaning" on the site OmegaWiki
# person identifiers
'P214' => wdpf ('P214', 'VIAF identifier'),
'P496' => wdpf ('P496', 'ORCID identifier'),
'P213' => wdpf ('P213', 'ISNI'), # check
'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database
# personal data?
'P569' => wdpf ('P569', 'Date of birth'),
......@@ -186,17 +190,19 @@ my %filters=
'P2298' => wdpf ('P2298', 'NSDAP membership number (1925-1945)'),
# publications
'P345' => wdpf ('P345', 'IMDb identifier'),
'P212' => wdpf ('P212', 'ISBN-13'),
'P236' => wdpf ('P212', 'ISSN'),
'P345' => wdpf ('P345', 'IMDb identifier'),
'P356' => wdpf ('P356', 'DOI'),
'P698' => wdpf ('P698', 'PubMed ID'), # identifier for journal articles/abstracts in PubMed
'P957' => wdpf ('P957', 'ISBN-10'),
'P3035' => wdpf ('P3035', 'ISBN publisher prefix'), # ISBN publisher prefix
# arXiv.org
'P818' => wdpf ('P818', 'arXiv ID'),
'P820' => wdpf ('P820', 'arXiv classification'),
# permanent identifiers
'P356' => wdpf ('P356', 'DOI'),
'P1184' => wdpf ('P1184', 'Handle'),
'P727' => wdpf ('P727', 'Europeana ID'),
'P1036' => wdpf ('P1036', 'Dewey Decimal Classification'),
......@@ -217,8 +223,10 @@ my %filters=
'P436' => wdpf ('P436', 'MusicBrainz release group id'),
'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
# misc.
# Geography
'P625' => wdpf ('P625', 'Geo Coordinates'),
'1566' => wdpf ('P1566', 'GeoNames ID'),
'P964' => wdpf ('P964', 'Austrian municipality key'), # identifier for municipalities in Austria
# chemistry
'P233' => wdpf ('P233', 'SMILES'), # Simplified Molecular Input Line Entry Specification
......@@ -240,6 +248,24 @@ my %filters=
'P1072' => wdpf ('P1072' => 'readable file format'),
'P1073' => wdpf ('P1073' => 'writable file format'),
'P1195' => wdpf ('P1195' => 'file extension'),
# external-id
'P503' => wdpf ('P503' => 'ISO standard'), # number of the ISO standard which normalizes the object
# URLs
'P854' => wdpf ('P854' => 'reference URL'),
'P856' => wdpf ('P856' => 'official website'),
'P953' => wdpf ('P953' => 'full text available at'),
'P973' => wdpf ('P973' => 'described at URL'),
'P1019' => wdpf ('P1019' => 'feed URL'),
'P1065' => wdpf ('P1065' => 'archive URL'),
'P1324' => wdpf ('P1324' => 'source code repository'),
'P1325' => wdpf ('P1325' => 'external data available at'),
'P1401' => wdpf ('P1401' => 'bug tracking system'),
'P1581' => wdpf ('P1581' => 'official blog'),
'P2699' => wdpf ('P2699' => 'URL'),
# '' => wdpf ('' => ''),
);
my @filters= sort keys %filters;
......
......
......@@ -20,7 +20,7 @@ use Wiktionary::Utils;
use PDS;
my $seq= 'a';
my $date= '2016-07-04';
my $date= '2016-08-22';
my $lang= undef;
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
......@@ -127,7 +127,7 @@ sub scan_items
# print "index: ", Dumper ($index);
my ($idx_id, $idx_fo_num, $idx_pos_beg, $idx_pos_end)= map { $index->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
# print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
my $columns= $csv->{'columns'};
# print "columns: ", Dumper ($columns);
......@@ -280,6 +280,7 @@ sub get_items
my $cnt_items= 0;
foreach my $rec_num (sort { $a <=> $b } @rec_nums)
{
print "rec_num=[$rec_num]\n";
my $data= $pds->retrieve ($rec_num);
# main::hexdump ($data);
my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
......@@ -292,6 +293,7 @@ sub get_items
fo_pos_beg => $beg,
fo_pos_end => $end,
};
print "row: ", Dumper ($row);
if ($x_rec_num > 0)
{
......
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please sign in to comment