Skip to content
Snippets Groups Projects
Commit 3c2444aa authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

redesign toc structure

parent 10355f7c
No related branches found
No related tags found
No related merge requests found
......@@ -175,8 +175,9 @@ sub save
my $new_reg= shift;
my $be= $obj->{'cfg'}->{'backend'};
print "save [$new_reg] be=[$be]\n";
print main::Dumper ($new_reg);
# print "save [$new_reg] be=[$be]\n";
# print main::Dumper ($new_reg);
if ($be eq 'TA::Hasher')
{
my $id_str= $search->{$obj->{'key'}};
......@@ -210,8 +211,8 @@ sub save
}
my $j= encode_json ($all_reg);
print "fnm=[$fnm]\n";
print "generated json: [$j]\n";
# print "fnm=[$fnm]\n";
# print "generated json: [$j]\n";
open (J, '>:utf8', $fnm); print J $j; close (J);
}
elsif ($be eq 'MongoDB')
......@@ -288,13 +289,77 @@ sub load_toc_v1
$toc;
}
sub load_single_toc
{
my $reg= shift;
my $store= shift;
my $cache= shift;
my $c= $reg->{'proj_cat'};
return undef unless (defined ($c)); # not initialized?
my $f= $c . '/' . $store . '.toc.json';
my $t= TA::Util::slurp_file ($f, 'json');
if ($cache)
{
$reg->{'tocs'}->{$store}= $t;
}
$t;
}
sub load_multi_tocs
{
my $reg= shift;
my $store= shift;
my $cache= shift;
my $c= $reg->{'proj_cat'};
return undef unless (defined ($c)); # not initialized?
my @stores= (defined ($store)) ? $store : $reg->stores();
return undef unless (@stores); # return nothing if there is nothing...
my $toc= {};
foreach my $s (@stores)
{
my $f= $c . '/' . $s . '.toc.json';
my $t= TA::Util::slurp_file ($f, 'json');
if ($cache)
{
$reg->{'tocs'}->{$s}= $t;
}
foreach my $k (keys %$t)
{
my $tk= $t->{$k};
# print "k=[$k] item: ", main::Dumper ($tk);
my $r;
unless (defined ($r= $toc->{$k}))
{ # not yet present in the toc
$toc->{$k}= $r= { 'seq' => $t->{$k}->{'seq'} };
}
# print "r: ", main::Dumper ($r);
push (@{$r->{'stores'}}, { 'store' => $s, 'upd' => $tk->{'upd'} });
}
}
$toc;
}
sub verify_toc
{
my $reg= shift;
my $check_item= shift; # callback: update TOC item
my $hdr= shift || [];
my $reset= shift;
my @hdr1= qw(seq found store_count);
my @hdr1= qw(key seq found store_count);
# my @hdr1= qw(seq store_count);
my @stores= $reg->stores();
# print "stores: ", join (', ', @stores), "\n"; exit;
......@@ -302,15 +367,38 @@ sub verify_toc
#### my @extra_fields= (exists ($reg->{'toc_extra_fields'})) ? $reg->{'toc_extra_fields'} : ();
my $c= $reg->{'proj_cat'};
# get list of key to sequence mapping
my $fnm_key_seq= $c . '/KEY-SEQ.toc.json';
my $KEY_SEQ;
$KEY_SEQ= TA::Util::slurp_file ($fnm_key_seq, 'json') unless ($reset);
$KEY_SEQ= {} unless (defined $KEY_SEQ);
# pick up current tocs to see if the sequence needs to be updated
my %stores;
foreach my $s (@stores)
{
my $f= $c . '/' . $s . '.toc.json';
my $t= TA::Util::slurp_file ($f, 'json');
$t= {} unless (defined ($t)); # we need an empty toc if there is no toc yet
=begin comment
my $f= $c . '/' . $s . '.toc.json';
my $t;
$t= TA::Util::slurp_file ($f, 'json') unless ($reset);
if (defined ($t))
{
foreach my $e (@$t) { $e->{'found'}= 0; }
}
else
{
$t= []; # we need an empty toc if there is no toc yet
}
$stores{$s}= $t;
... dunno ... do we need the old toc?
=end comment
=cut
$stores{$s}= [];
}
my %items;
......@@ -329,45 +417,45 @@ sub verify_toc
print "proj_cat=[$d]\n";
find (\&item_files, $d);
my $key_seq_updated= 0;
# print "items: ", main::Dumper (\%items);
foreach my $item (keys %items)
{
my $p= $items{$item};
my $j= TA::Util::slurp_file ($p->[0], 'json');
# print "[$p->[0]] j: ", main::Dumper ($j);
my @i_stores= keys %{$j->{'store'}};
print "[$p->[0]] j: ", main::Dumper ($j);
my $key= $j->{'key'};
# print join (' ', $key, @i_stores), "\n";
my $seq= $KEY_SEQ->{$key};
unless (defined ($seq))
{
$seq= $KEY_SEQ->{$key}= $reg->next_seq();
$key_seq_updated++;
}
# search for a key's sequence number in all known stores, not only
# in those that are *currently* used for this store
my $seq;
S1: foreach my $store (@stores)
my (@i_stores, %i_stores);
E1: foreach my $jj (@{$j->{'entries'}})
{
if (exists ($stores{$store}->{$key}))
{
$seq= $stores{$store}->{$key}->{'seq'};
last S1;
}
}
my $store= $jj->{'store'};
S2: foreach my $store (@i_stores)
{
my $ster; # store's toc entry record ;)
unless (defined ($ster= $stores{$store}->{$key}))
{
$ster= $stores{$store}->{$key}=
{
'seq' => $reg->next_seq(),
'upd' => time (),
};
}
$ster->{'found'}= 1;
print join ('/', $key, $seq, $store), "\n";
my $jj= $j->{'store'}->{$store};
$ster->{'store_count'}= scalar @i_stores;
$i_stores{$store}= $jj;
push (@i_stores, $store);
my $ster=
{
'key' => $key,
'seq' => $seq,
'found' => 0, # flag that indicates if object is present (not used here?)
'upd' => time (),
};
&$check_item($j, $jj, $ster) if (defined ($check_item));
print "ster: ", main::Dumper ($ster);
push (@{$stores{$store}}, $ster);
}
}
......@@ -377,6 +465,7 @@ sub verify_toc
{
my $ss= $stores{$s};
# save TOC in json format
my $f= $c . '/' . $s . '.toc.json';
print "saving toc to [$f]\n";
unless (open (TOC, '>:utf8', $f))
......@@ -387,6 +476,7 @@ sub verify_toc
print TOC encode_json ($ss), "\n";
close (TOC);
# save TOC in CSV format
$f= $c . '/' . $s . '.toc.csv';
print "saving toc to [$f]\n";
unless (open (TOC, '>:utf8', $f))
......@@ -394,20 +484,98 @@ sub verify_toc
print STDERR "cant save toc file '$f'\n";
next;
}
print TOC join (';', 'key', @hdr1, @$hdr), "\n";
print TOC join (';', @hdr1, @$hdr), "\n";
foreach my $k (keys %$ss)
foreach my $r (@$ss)
{
my $r= $ss->{$k};
print TOC join (';', $k, map { $r->{$_} } @hdr1, @$hdr), "\n";
}
print __LINE__, " r: ", main::Dumper ($r);
print TOC join (';', map { $r->{$_} } @hdr1), ';';
if (1 || $r->{'found'})
{
print TOC join (';', map { $r->{$_} } @$hdr);
}
else
{
print TOC join (';', map { '' } @$hdr);
}
print TOC "\n";
}
close (TOC);
}
if ($key_seq_updated)
{
print "saving toc to [$fnm_key_seq]\n";
unless (open (KEY_SEQ, '>:utf8', $fnm_key_seq))
{
print STDERR "cant save toc file '$fnm_key_seq'\n";
next;
}
print KEY_SEQ encode_json ($KEY_SEQ), "\n";
close (KEY_SEQ);
}
# TODO: return something meaningful
}
sub remove_from_store
{
my $reg= shift;
my $store= shift;
my $drop_list= shift; # array ref containing entries: [ $md5, $path ]
# TODO: maybe a more universial format could be useful
my $be= $reg->{'cfg'}->{'backend'};
if ($be eq 'TA::Hasher')
{
my %drop;
foreach my $item (@$drop_list)
{
my ($id_str, $path)= @$item;
my ($r, $fnm)= $reg->ta_retrieve ($id_str);
# print "id_str=[$id_str] fnm=[$fnm] r: ", main::Dumper ($r);
next unless (defined ($r)); # this item has possibly been deleted already
my @new_entries= ();
my @dropped_entries= ();
foreach my $entry (@{$r->{'entries'}})
{
if ($entry->{'store'} eq $store && $entry->{'path'} eq $path)
{
push (@dropped_entries, $entry);
}
else
{
push (@new_entries, $entry);
}
}
$drop{$id_str}= \@dropped_entries;
if (@new_entries)
{
$r->{'entries'}= \@new_entries;
my $j= encode_json ($r);
# print "generated json: [$j]\n";
open (J, '>:utf8', $fnm); print J $j; close (J);
}
else
{
# print "nothing left to be saved; deleting file [$fnm]\n";
unlink ($fnm);
}
}
return \%drop;
}
elsif ($be eq 'MongoDB')
{
die ("implement MongoDB remove");
}
}
=head1 sequence number
=head2 $reg->next_seq()
......@@ -531,7 +699,7 @@ sub ta_match
{
next REG unless ($reg->{$k} eq $search->{$k});
}
print "found match: ", main::Dumper ($reg);
# print "found match: ", main::Dumper ($reg);
return ($reg, $i);
}
return (undef, 0);
......
......@@ -32,6 +32,7 @@ $Data::Dumper::Indent= 1;
use TA::ObjReg;
# use TA::Hasher;
# use TA::Util;
use md5cat;
my @PAR= ();
my $project;
......@@ -41,6 +42,14 @@ my $DEBUG= 0;
my $STOP= 0;
my $op_mode= 'refresh';
my @hdr= qw(md5 path mtime fs_size ino);
# --- 8< --- [from chkmd5.pl] ---
# my $Dir_Pattern= '[0-9_a-zA-Z]*';
my $Dir_Pattern= '.';
my $DEFAULT_file_list= "find $Dir_Pattern -xdev -type f -print|";
# --- >8 ---
while (my $arg= shift (@ARGV))
{
if ($arg eq '--') { push (@PAR, @ARGV); @ARGV= (); }
......@@ -96,15 +105,11 @@ if ($op_mode eq 'refresh')
}
print "store_cfg: ", Dumper ($store_cfg) if ($DEBUG);
if ($catalog->{'format'} eq 'md5cat')
{
refresh_md5cat ($objreg, $store);
}
if ($catalog->{'format'} eq 'md5cat') { refresh_md5cat ($objreg, $store); }
elsif ($catalog->{'format'} eq 'internal') { refresh_internal ($objreg, $store); }
}
elsif ($op_mode eq 'verify')
{
my @hdr= qw(path_count path mtime fs_size ino);
$objreg->verify_toc (\&verify_toc_item, \@hdr);
}
elsif ($op_mode eq 'lookup')
......@@ -158,6 +163,108 @@ sub refresh_md5cat
printf ("%6d files processed; %6d files updated\n", $cnt_processed, $cnt_updated);
}
sub refresh_internal
{
my $objreg= shift;
my $store= shift;
my %extra= @_;
my $cnt_processed= 0;
my $cnt_updated= 0;
my $cnt_dropped= 0;
$objreg->verify_toc (\&verify_toc_item, \@hdr);
my $toc= $objreg->load_single_toc ($store);
# print "toc: ", Dumper ($toc);
my $md5cat= new md5cat ();
$md5cat->read_flist ($DEFAULT_file_list);
# print "md5cat: ", Dumper ($md5cat);
# compare TOC and reference filelist
my $fl= $md5cat->{'FLIST'};
my %key= ();
foreach my $x (@$toc)
{
# print __LINE__, " k=[$k]\n";
my $k= $x->{'key'};
my $p= $x->{'path'};
$key{$k}->{$p}= 0;
if (exists ($fl->{$p}))
{
$cnt_processed++;
my $f= $fl->{$p};
my $matches= 1;
AN: foreach my $an (qw(mtime size ino))
{
unless ($f->{$an} eq $x->{$an})
{
# print "mismatch in [$an]! x: ", Dumper ($x); print "f: ", Dumper ($f);
$matches= 0;
last AN;
}
}
# print "matches: $p $matches\n";
if ($matches)
{
$f->{'state'}= 'nocheck';
$f->{'md5'}= $x->{'md5'};
}
}
else
{
# print "file missing: ", Dumper ($x);
$cnt_dropped++;
}
}
# my %paths= map { my $x= $toc->{$_}; $x->{'found'}= 0; $x->{'path'} => $x } keys %$toc;
# print "paths: ", Dumper (\%paths);
# print "fl: ", Dumper ($fl);
my $new_files= $md5cat->check_new_files ();
# print "new_files: ", Dumper ($new_files);
$md5cat->integrate_md5_sums ($new_files);
# $md5cat->save_catalog (); # TODO: if save_catalog flag is true!
# ZZZ
# update the Object registry with new items
foreach my $nf (@$new_files)
{
my ($md5, $path, $size, $mtime)= @$nf;
# print "md5=[$md5] size=[$size] path=[$path]\n";
$cnt_processed++;
my @upd= process_file ($md5, $path, $size);
$cnt_updated++ if (@upd);
}
# get filelist again after reintegration to find keys which are no longer in the catalog
$fl= $md5cat->{'FLIST'};
# print __LINE__, " fl: ", Dumper ($fl);
foreach my $p (keys %$fl)
{
$key{$fl->{$p}->{'md5'}}->{$p}= 1;
}
# print __LINE__, " key: ", Dumper (\%key);
my @drop= ();
foreach my $k (keys %key)
{
my $x1= $key{$k};
foreach my $p (keys %$x1)
{
push (@drop, [$k, $p]) if ($x1->{$p} == 0);
}
}
# print __LINE__, " drop: ", Dumper (\@drop);
$objreg->remove_from_store ($store, \@drop);
printf ("files: %6d processed; %6d updated; %6d (%d) dropped\n", $cnt_processed, $cnt_updated, $cnt_dropped, scalar (@drop));
}
sub process_file
{
my ($md5, $path, $size)= @_;
......@@ -179,7 +286,7 @@ sub process_file
my $search= { 'md5' => $md5, 'store' => $store, 'path' => $path };
my $reg= $objreg->lookup ($search);
print __LINE__, " reg: ", Dumper ($reg);
# print __LINE__, " reg: ", Dumper ($reg);
my @upd;
my $ydata; # pointer to file catalog data within main datastructure
......@@ -220,7 +327,7 @@ sub process_file
if (@upd)
{
print "saving (", join ('|', @upd), ")\n";
# print "saving (", join ('|', @upd), ")\n";
# print __LINE__, " reg: ", Dumper ($reg);
$objreg->save ($search, $reg);
}
......@@ -235,15 +342,15 @@ sub verify_toc_item
my $jj= shift; # this is just the part refering to the store currently processed
my $ster= shift; # TOC item to be updated
my @paths= keys %{$jj->{'path'}};
$ster->{'path_count'}= scalar @paths;
my $p1= shift (@paths);
my $px1= $jj->{'path'}->{$p1};
# print __LINE__, " verify_toc_item: j=", Dumper ($j);
print __LINE__, " verify_toc_item: jj=", Dumper ($jj);
# my @paths= keys %{$jj->{'path'}};
# $ster->{'path_count'}= scalar @paths; ... we don't see that this way anymore
$ster->{'path'}= $p1;
$ster->{'mtime'}= $px1->{'mtime'};
$ster->{'fs_size'}= $px1->{'fs_size'};
$ster->{'ino'}= $px1->{'ino'};
foreach my $k (qw(md5 path mtime fs_size ino))
{
$ster->{$k}= $jj->{$k};
}
}
__END__
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment