diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm index 114cd92d9d3ef7962557cd832ff3b16c8a803ac4..1724da971e409612d91c4be22b8dc95b995abc72 100644 --- a/textarchive/lib/TA/ObjReg.pm +++ b/textarchive/lib/TA/ObjReg.pm @@ -175,8 +175,9 @@ sub save my $new_reg= shift; my $be= $obj->{'cfg'}->{'backend'}; - print "save [$new_reg] be=[$be]\n"; - print main::Dumper ($new_reg); + # print "save [$new_reg] be=[$be]\n"; + # print main::Dumper ($new_reg); + if ($be eq 'TA::Hasher') { my $id_str= $search->{$obj->{'key'}}; @@ -210,8 +211,8 @@ sub save } my $j= encode_json ($all_reg); - print "fnm=[$fnm]\n"; - print "generated json: [$j]\n"; + # print "fnm=[$fnm]\n"; + # print "generated json: [$j]\n"; open (J, '>:utf8', $fnm); print J $j; close (J); } elsif ($be eq 'MongoDB') @@ -288,13 +289,77 @@ sub load_toc_v1 $toc; } +sub load_single_toc +{ + my $reg= shift; + my $store= shift; + my $cache= shift; + + my $c= $reg->{'proj_cat'}; + return undef unless (defined ($c)); # not initialized? + + my $f= $c . '/' . $store . '.toc.json'; + my $t= TA::Util::slurp_file ($f, 'json'); + if ($cache) + { + $reg->{'tocs'}->{$store}= $t; + } + + $t; +} + +sub load_multi_tocs +{ + my $reg= shift; + my $store= shift; + my $cache= shift; + + my $c= $reg->{'proj_cat'}; + return undef unless (defined ($c)); # not initialized? + + my @stores= (defined ($store)) ? $store : $reg->stores(); + + return undef unless (@stores); # return nothing if there is nothing... + + my $toc= {}; + foreach my $s (@stores) + { + my $f= $c . '/' . $s . '.toc.json'; + my $t= TA::Util::slurp_file ($f, 'json'); + if ($cache) + { + $reg->{'tocs'}->{$s}= $t; + } + + foreach my $k (keys %$t) + { + my $tk= $t->{$k}; +# print "k=[$k] item: ", main::Dumper ($tk); + my $r; + + unless (defined ($r= $toc->{$k})) + { # not yet present in the toc + $toc->{$k}= $r= { 'seq' => $t->{$k}->{'seq'} }; + } + +# print "r: ", main::Dumper ($r); + + push (@{$r->{'stores'}}, { 'store' => $s, 'upd' => $tk->{'upd'} }); + } + } + + $toc; +} + sub verify_toc { my $reg= shift; my $check_item= shift; # callback: update TOC item my $hdr= shift || []; + my $reset= shift; - my @hdr1= qw(seq found store_count); + my @hdr1= qw(key seq found store_count); + # my @hdr1= qw(seq store_count); my @stores= $reg->stores(); # print "stores: ", join (', ', @stores), "\n"; exit; @@ -302,15 +367,38 @@ sub verify_toc #### my @extra_fields= (exists ($reg->{'toc_extra_fields'})) ? $reg->{'toc_extra_fields'} : (); my $c= $reg->{'proj_cat'}; + # get list of key to sequence mapping + my $fnm_key_seq= $c . '/KEY-SEQ.toc.json'; + my $KEY_SEQ; + $KEY_SEQ= TA::Util::slurp_file ($fnm_key_seq, 'json') unless ($reset); + $KEY_SEQ= {} unless (defined $KEY_SEQ); + # pick up current tocs to see if the sequence needs to be updated my %stores; foreach my $s (@stores) { - my $f= $c . '/' . $s . '.toc.json'; - my $t= TA::Util::slurp_file ($f, 'json'); - $t= {} unless (defined ($t)); # we need an empty toc if there is no toc yet +=begin comment + + my $f= $c . '/' . $s . '.toc.json'; + my $t; + $t= TA::Util::slurp_file ($f, 'json') unless ($reset); + if (defined ($t)) + { + foreach my $e (@$t) { $e->{'found'}= 0; } + } + else + { + $t= []; # we need an empty toc if there is no toc yet + } $stores{$s}= $t; + + ... dunno ... do we need the old toc? + +=end comment +=cut + + $stores{$s}= []; } my %items; @@ -329,45 +417,45 @@ sub verify_toc print "proj_cat=[$d]\n"; find (\&item_files, $d); + my $key_seq_updated= 0; # print "items: ", main::Dumper (\%items); foreach my $item (keys %items) { my $p= $items{$item}; my $j= TA::Util::slurp_file ($p->[0], 'json'); - # print "[$p->[0]] j: ", main::Dumper ($j); - my @i_stores= keys %{$j->{'store'}}; + print "[$p->[0]] j: ", main::Dumper ($j); + my $key= $j->{'key'}; - # print join (' ', $key, @i_stores), "\n"; + my $seq= $KEY_SEQ->{$key}; + unless (defined ($seq)) + { + $seq= $KEY_SEQ->{$key}= $reg->next_seq(); + $key_seq_updated++; + } # search for a key's sequence number in all known stores, not only # in those that are *currently* used for this store - my $seq; - S1: foreach my $store (@stores) + my (@i_stores, %i_stores); + E1: foreach my $jj (@{$j->{'entries'}}) { - if (exists ($stores{$store}->{$key})) - { - $seq= $stores{$store}->{$key}->{'seq'}; - last S1; - } - } + my $store= $jj->{'store'}; - S2: foreach my $store (@i_stores) - { - my $ster; # store's toc entry record ;) - unless (defined ($ster= $stores{$store}->{$key})) - { - $ster= $stores{$store}->{$key}= - { - 'seq' => $reg->next_seq(), - 'upd' => time (), - }; - } - $ster->{'found'}= 1; + print join ('/', $key, $seq, $store), "\n"; - my $jj= $j->{'store'}->{$store}; - $ster->{'store_count'}= scalar @i_stores; + $i_stores{$store}= $jj; + push (@i_stores, $store); + + my $ster= + { + 'key' => $key, + 'seq' => $seq, + 'found' => 0, # flag that indicates if object is present (not used here?) + 'upd' => time (), + }; &$check_item($j, $jj, $ster) if (defined ($check_item)); + print "ster: ", main::Dumper ($ster); + push (@{$stores{$store}}, $ster); } } @@ -377,6 +465,7 @@ sub verify_toc { my $ss= $stores{$s}; + # save TOC in json format my $f= $c . '/' . $s . '.toc.json'; print "saving toc to [$f]\n"; unless (open (TOC, '>:utf8', $f)) @@ -387,6 +476,7 @@ sub verify_toc print TOC encode_json ($ss), "\n"; close (TOC); + # save TOC in CSV format $f= $c . '/' . $s . '.toc.csv'; print "saving toc to [$f]\n"; unless (open (TOC, '>:utf8', $f)) @@ -394,20 +484,98 @@ sub verify_toc print STDERR "cant save toc file '$f'\n"; next; } - print TOC join (';', 'key', @hdr1, @$hdr), "\n"; + print TOC join (';', @hdr1, @$hdr), "\n"; - foreach my $k (keys %$ss) + foreach my $r (@$ss) { - my $r= $ss->{$k}; - print TOC join (';', $k, map { $r->{$_} } @hdr1, @$hdr), "\n"; - } + print __LINE__, " r: ", main::Dumper ($r); + print TOC join (';', map { $r->{$_} } @hdr1), ';'; + if (1 || $r->{'found'}) + { + print TOC join (';', map { $r->{$_} } @$hdr); + } + else + { + print TOC join (';', map { '' } @$hdr); + } + + print TOC "\n"; + } close (TOC); } + if ($key_seq_updated) + { + print "saving toc to [$fnm_key_seq]\n"; + unless (open (KEY_SEQ, '>:utf8', $fnm_key_seq)) + { + print STDERR "cant save toc file '$fnm_key_seq'\n"; + next; + } + print KEY_SEQ encode_json ($KEY_SEQ), "\n"; + close (KEY_SEQ); + } + # TODO: return something meaningful } +sub remove_from_store +{ + my $reg= shift; + my $store= shift; + my $drop_list= shift; # array ref containing entries: [ $md5, $path ] + # TODO: maybe a more universial format could be useful + + my $be= $reg->{'cfg'}->{'backend'}; + if ($be eq 'TA::Hasher') + { + my %drop; + foreach my $item (@$drop_list) + { + my ($id_str, $path)= @$item; + my ($r, $fnm)= $reg->ta_retrieve ($id_str); + # print "id_str=[$id_str] fnm=[$fnm] r: ", main::Dumper ($r); + + next unless (defined ($r)); # this item has possibly been deleted already + + my @new_entries= (); + my @dropped_entries= (); + foreach my $entry (@{$r->{'entries'}}) + { + if ($entry->{'store'} eq $store && $entry->{'path'} eq $path) + { + push (@dropped_entries, $entry); + } + else + { + push (@new_entries, $entry); + } + } + $drop{$id_str}= \@dropped_entries; + + if (@new_entries) + { + $r->{'entries'}= \@new_entries; + + my $j= encode_json ($r); + # print "generated json: [$j]\n"; + open (J, '>:utf8', $fnm); print J $j; close (J); + } + else + { + # print "nothing left to be saved; deleting file [$fnm]\n"; + unlink ($fnm); + } + } + return \%drop; + } + elsif ($be eq 'MongoDB') + { + die ("implement MongoDB remove"); + } +} + =head1 sequence number =head2 $reg->next_seq() @@ -531,7 +699,7 @@ sub ta_match { next REG unless ($reg->{$k} eq $search->{$k}); } - print "found match: ", main::Dumper ($reg); + # print "found match: ", main::Dumper ($reg); return ($reg, $i); } return (undef, 0); diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl index b71b348507cee1919febee4c40f152037d3f693e..2796bf5ad654a76d7c7b970881f5436e13f3421f 100755 --- a/textarchive/vlib001.pl +++ b/textarchive/vlib001.pl @@ -32,6 +32,7 @@ $Data::Dumper::Indent= 1; use TA::ObjReg; # use TA::Hasher; # use TA::Util; +use md5cat; my @PAR= (); my $project; @@ -41,6 +42,14 @@ my $DEBUG= 0; my $STOP= 0; my $op_mode= 'refresh'; +my @hdr= qw(md5 path mtime fs_size ino); + +# --- 8< --- [from chkmd5.pl] --- +# my $Dir_Pattern= '[0-9_a-zA-Z]*'; +my $Dir_Pattern= '.'; +my $DEFAULT_file_list= "find $Dir_Pattern -xdev -type f -print|"; +# --- >8 --- + while (my $arg= shift (@ARGV)) { if ($arg eq '--') { push (@PAR, @ARGV); @ARGV= (); } @@ -96,15 +105,11 @@ if ($op_mode eq 'refresh') } print "store_cfg: ", Dumper ($store_cfg) if ($DEBUG); - if ($catalog->{'format'} eq 'md5cat') - { - refresh_md5cat ($objreg, $store); - } + if ($catalog->{'format'} eq 'md5cat') { refresh_md5cat ($objreg, $store); } + elsif ($catalog->{'format'} eq 'internal') { refresh_internal ($objreg, $store); } } elsif ($op_mode eq 'verify') { - my @hdr= qw(path_count path mtime fs_size ino); - $objreg->verify_toc (\&verify_toc_item, \@hdr); } elsif ($op_mode eq 'lookup') @@ -158,6 +163,108 @@ sub refresh_md5cat printf ("%6d files processed; %6d files updated\n", $cnt_processed, $cnt_updated); } +sub refresh_internal +{ + my $objreg= shift; + my $store= shift; + my %extra= @_; + + my $cnt_processed= 0; + my $cnt_updated= 0; + my $cnt_dropped= 0; + + $objreg->verify_toc (\&verify_toc_item, \@hdr); + my $toc= $objreg->load_single_toc ($store); + # print "toc: ", Dumper ($toc); + + my $md5cat= new md5cat (); + $md5cat->read_flist ($DEFAULT_file_list); + # print "md5cat: ", Dumper ($md5cat); + + # compare TOC and reference filelist + my $fl= $md5cat->{'FLIST'}; + my %key= (); + foreach my $x (@$toc) + { +# print __LINE__, " k=[$k]\n"; + my $k= $x->{'key'}; + my $p= $x->{'path'}; + $key{$k}->{$p}= 0; + + if (exists ($fl->{$p})) + { + $cnt_processed++; + my $f= $fl->{$p}; + my $matches= 1; + AN: foreach my $an (qw(mtime size ino)) + { + unless ($f->{$an} eq $x->{$an}) + { + # print "mismatch in [$an]! x: ", Dumper ($x); print "f: ", Dumper ($f); + $matches= 0; + last AN; + } + } + +# print "matches: $p $matches\n"; + if ($matches) + { + $f->{'state'}= 'nocheck'; + $f->{'md5'}= $x->{'md5'}; + } + } + else + { + # print "file missing: ", Dumper ($x); + $cnt_dropped++; + } + } + # my %paths= map { my $x= $toc->{$_}; $x->{'found'}= 0; $x->{'path'} => $x } keys %$toc; + # print "paths: ", Dumper (\%paths); + # print "fl: ", Dumper ($fl); + + my $new_files= $md5cat->check_new_files (); + # print "new_files: ", Dumper ($new_files); + $md5cat->integrate_md5_sums ($new_files); + # $md5cat->save_catalog (); # TODO: if save_catalog flag is true! + +# ZZZ + # update the Object registry with new items + foreach my $nf (@$new_files) + { + my ($md5, $path, $size, $mtime)= @$nf; + # print "md5=[$md5] size=[$size] path=[$path]\n"; + + $cnt_processed++; + my @upd= process_file ($md5, $path, $size); + $cnt_updated++ if (@upd); + } + + # get filelist again after reintegration to find keys which are no longer in the catalog + $fl= $md5cat->{'FLIST'}; + # print __LINE__, " fl: ", Dumper ($fl); + foreach my $p (keys %$fl) + { + $key{$fl->{$p}->{'md5'}}->{$p}= 1; + } + # print __LINE__, " key: ", Dumper (\%key); + + my @drop= (); + foreach my $k (keys %key) + { + my $x1= $key{$k}; + foreach my $p (keys %$x1) + { + push (@drop, [$k, $p]) if ($x1->{$p} == 0); + } + } + # print __LINE__, " drop: ", Dumper (\@drop); + + $objreg->remove_from_store ($store, \@drop); + + printf ("files: %6d processed; %6d updated; %6d (%d) dropped\n", $cnt_processed, $cnt_updated, $cnt_dropped, scalar (@drop)); +} + sub process_file { my ($md5, $path, $size)= @_; @@ -179,7 +286,7 @@ sub process_file my $search= { 'md5' => $md5, 'store' => $store, 'path' => $path }; my $reg= $objreg->lookup ($search); - print __LINE__, " reg: ", Dumper ($reg); + # print __LINE__, " reg: ", Dumper ($reg); my @upd; my $ydata; # pointer to file catalog data within main datastructure @@ -220,7 +327,7 @@ sub process_file if (@upd) { - print "saving (", join ('|', @upd), ")\n"; + # print "saving (", join ('|', @upd), ")\n"; # print __LINE__, " reg: ", Dumper ($reg); $objreg->save ($search, $reg); } @@ -235,15 +342,15 @@ sub verify_toc_item my $jj= shift; # this is just the part refering to the store currently processed my $ster= shift; # TOC item to be updated - my @paths= keys %{$jj->{'path'}}; - $ster->{'path_count'}= scalar @paths; - my $p1= shift (@paths); - my $px1= $jj->{'path'}->{$p1}; +# print __LINE__, " verify_toc_item: j=", Dumper ($j); +print __LINE__, " verify_toc_item: jj=", Dumper ($jj); + # my @paths= keys %{$jj->{'path'}}; + # $ster->{'path_count'}= scalar @paths; ... we don't see that this way anymore - $ster->{'path'}= $p1; - $ster->{'mtime'}= $px1->{'mtime'}; - $ster->{'fs_size'}= $px1->{'fs_size'}; - $ster->{'ino'}= $px1->{'ino'}; + foreach my $k (qw(md5 path mtime fs_size ino)) + { + $ster->{$k}= $jj->{$k}; + } } __END__