redesign toc structure

3c2444aa · Gerhard Gonter · 10355f7c · 3c2444aa · 3c2444aa
Commit 3c2444aa authored Oct 20, 2013 by Gerhard Gonter
--- a/textarchive/lib/TA/ObjReg.pm
+++ b/textarchive/lib/TA/ObjReg.pm
@@ -175,8 +175,9 @@ sub save
  my $new_reg= shift;

  my $be= $obj->{'cfg'}->{'backend'};
-  print "save [$new_reg] be=[$be]\n";
-  print main::Dumper ($new_reg);
+  # print "save [$new_reg] be=[$be]\n";
+  # print main::Dumper ($new_reg);
+
  if ($be eq 'TA::Hasher')
  {
    my $id_str= $search->{$obj->{'key'}};
@@ -210,8 +211,8 @@ sub save
    }

    my $j= encode_json ($all_reg);
-    print "fnm=[$fnm]\n";
-    print "generated json: [$j]\n";
+    # print "fnm=[$fnm]\n";
+    # print "generated json: [$j]\n";
    open (J, '>:utf8', $fnm); print J $j; close (J);
  }
  elsif ($be eq 'MongoDB')
@@ -288,13 +289,77 @@ sub load_toc_v1
  $toc;
 }

+sub load_single_toc
+{
+  my $reg= shift;
+  my $store= shift;
+  my $cache= shift;
+
+  my $c= $reg->{'proj_cat'};
+  return undef unless (defined ($c)); # not initialized?
+
+    my $f= $c . '/' . $store . '.toc.json';
+    my $t= TA::Util::slurp_file ($f, 'json');
+    if ($cache)
+    {
+      $reg->{'tocs'}->{$store}= $t;
+    }
+
+  $t;
+}
+
+sub load_multi_tocs
+{
+  my $reg= shift;
+  my $store= shift;
+  my $cache= shift;
+
+  my $c= $reg->{'proj_cat'};
+  return undef unless (defined ($c)); # not initialized?
+
+  my @stores= (defined ($store)) ? $store : $reg->stores();
+
+  return undef unless (@stores); # return nothing if there is nothing...
+
+  my $toc= {};
+  foreach my $s (@stores)
+  {
+    my $f= $c . '/' . $s . '.toc.json';
+    my $t= TA::Util::slurp_file ($f, 'json');
+    if ($cache)
+    {
+      $reg->{'tocs'}->{$s}= $t;
+    }
+
+    foreach my $k (keys %$t)
+    {
+      my $tk= $t->{$k};
+# print "k=[$k] item: ", main::Dumper ($tk);
+      my $r;
+
+      unless (defined ($r= $toc->{$k}))
+      { # not yet present in the toc
+        $toc->{$k}= $r= { 'seq' => $t->{$k}->{'seq'} };
+      }
+
+# print "r: ", main::Dumper ($r);
+
+      push (@{$r->{'stores'}}, { 'store' => $s, 'upd' => $tk->{'upd'} });
+    }
+  }
+
+  $toc;
+}
+
 sub verify_toc
 {
  my $reg= shift;
  my $check_item= shift; # callback: update TOC item
  my $hdr= shift || [];
+  my $reset= shift;

-  my @hdr1= qw(seq found store_count);
+  my @hdr1= qw(key seq found store_count);
+  # my @hdr1= qw(seq store_count);

  my @stores= $reg->stores();
  # print "stores: ", join (', ', @stores), "\n"; exit;
@@ -302,15 +367,38 @@ sub verify_toc
  #### my @extra_fields= (exists ($reg->{'toc_extra_fields'})) ? $reg->{'toc_extra_fields'} : ();
  my $c= $reg->{'proj_cat'};

+  # get list of key to sequence mapping
+  my $fnm_key_seq= $c . '/KEY-SEQ.toc.json';
+  my $KEY_SEQ;
+  $KEY_SEQ= TA::Util::slurp_file ($fnm_key_seq, 'json') unless ($reset);
+  $KEY_SEQ= {} unless (defined $KEY_SEQ);
+
  # pick up current tocs to see if the sequence needs to be updated
  my %stores;
  foreach my $s (@stores)
  {
-    my $f= $c . '/' . $s . '.toc.json';
-    my $t= TA::Util::slurp_file ($f, 'json');
-    $t= {} unless (defined ($t)); # we need an empty toc if there is no toc yet

+=begin comment
+
+    my $f= $c . '/' . $s . '.toc.json';
+    my $t;
+    $t= TA::Util::slurp_file ($f, 'json') unless ($reset);
+    if (defined ($t))
+    {
+      foreach my $e (@$t) { $e->{'found'}= 0; }
+    }
+    else
+    {
+      $t= []; # we need an empty toc if there is no toc yet
+    }
    $stores{$s}= $t;
+
+  ... dunno ... do we need the old toc?
+
+=end comment
+=cut
+
+    $stores{$s}= [];
  }

  my %items;
@@ -329,45 +417,45 @@ sub verify_toc
  print "proj_cat=[$d]\n";
  find (\&item_files, $d);

+  my $key_seq_updated= 0;
  # print "items: ", main::Dumper (\%items);
  foreach my $item (keys %items)
  {
    my $p= $items{$item};
    my $j= TA::Util::slurp_file ($p->[0], 'json');
-    # print "[$p->[0]] j: ", main::Dumper ($j);
-    my @i_stores= keys %{$j->{'store'}};
+    print "[$p->[0]] j: ", main::Dumper ($j);
+
    my $key= $j->{'key'};
-    # print join (' ', $key, @i_stores), "\n";
+    my $seq= $KEY_SEQ->{$key};
+    unless (defined ($seq))
+    {
+      $seq= $KEY_SEQ->{$key}= $reg->next_seq();
+      $key_seq_updated++;
+    }

    # search for a key's sequence number in all known stores, not only
    # in those that are *currently* used for this store
-    my $seq;
-    S1: foreach my $store (@stores)
+    my (@i_stores, %i_stores);
+    E1: foreach my $jj (@{$j->{'entries'}})
    {
-      if (exists ($stores{$store}->{$key}))
-      {
-        $seq= $stores{$store}->{$key}->{'seq'};
-        last S1;
-      }
-    }
+      my $store= $jj->{'store'};

-    S2: foreach my $store (@i_stores)
-    {
-      my $ster; # store's toc entry record ;)
-      unless (defined ($ster= $stores{$store}->{$key}))
-      {
-        $ster= $stores{$store}->{$key}=
+      print join ('/', $key, $seq, $store), "\n";
+
+      $i_stores{$store}= $jj;
+      push (@i_stores, $store);
+ 
+      my $ster=
      {
-          'seq' => $reg->next_seq(),
+        'key' => $key,
+        'seq' => $seq,
+        'found' => 0, # flag that indicates if object is present (not used here?)
        'upd' =>  time (),
      };
-      }
-      $ster->{'found'}= 1;
-
-      my $jj= $j->{'store'}->{$store};
-      $ster->{'store_count'}= scalar @i_stores;

      &$check_item($j, $jj, $ster) if (defined ($check_item));
+      print "ster: ", main::Dumper ($ster);
+      push (@{$stores{$store}}, $ster);
    }
  }

@@ -377,6 +465,7 @@ sub verify_toc
  {
    my $ss= $stores{$s};

+    # save TOC in json format
    my $f= $c . '/' . $s . '.toc.json';
    print "saving toc to [$f]\n";
    unless (open (TOC, '>:utf8', $f))
@@ -387,6 +476,7 @@ sub verify_toc
    print TOC encode_json ($ss), "\n";
    close (TOC);

+    # save TOC in CSV format
    $f= $c . '/' . $s . '.toc.csv';
    print "saving toc to [$f]\n";
    unless (open (TOC, '>:utf8', $f))
@@ -394,20 +484,98 @@ sub verify_toc
      print STDERR "cant save toc file '$f'\n";
      next;
    }
-    print TOC join (';', 'key', @hdr1, @$hdr), "\n";
+    print TOC join (';', @hdr1, @$hdr), "\n";
+
+    foreach my $r (@$ss)
+    {
+    print __LINE__, " r: ", main::Dumper ($r);
+      print TOC join (';', map { $r->{$_} } @hdr1), ';';

-    foreach my $k (keys %$ss)
+      if (1 || $r->{'found'})
+      {
+        print TOC join (';', map { $r->{$_} } @$hdr);
+      }
+      else
      {
-      my $r= $ss->{$k};
-      print TOC join (';', $k, map { $r->{$_} } @hdr1, @$hdr), "\n";
+        print TOC join (';', map { '' } @$hdr);
      }
        
+      print TOC "\n";
+    }
    close (TOC);
  }

+  if ($key_seq_updated)
+  {
+    print "saving toc to [$fnm_key_seq]\n";
+    unless (open (KEY_SEQ, '>:utf8', $fnm_key_seq))
+    {
+      print STDERR "cant save toc file '$fnm_key_seq'\n";
+      next;
+    }
+    print KEY_SEQ encode_json ($KEY_SEQ), "\n";
+    close (KEY_SEQ);
+  }
+
  # TODO: return something meaningful
 }

+sub remove_from_store
+{
+  my $reg= shift;
+  my $store= shift;
+  my $drop_list= shift; # array ref containing entries: [ $md5, $path ]
+  # TODO: maybe a more universial format could be useful
+
+  my $be= $reg->{'cfg'}->{'backend'};
+  if ($be eq 'TA::Hasher')
+  {
+    my %drop;
+    foreach my $item (@$drop_list)
+    {
+      my ($id_str, $path)= @$item;
+      my ($r, $fnm)= $reg->ta_retrieve ($id_str);
+      # print "id_str=[$id_str] fnm=[$fnm] r: ", main::Dumper ($r);
+
+      next unless (defined ($r)); # this item has possibly been deleted already
+
+      my @new_entries= ();
+      my @dropped_entries= ();
+      foreach my $entry (@{$r->{'entries'}})
+      {
+        if ($entry->{'store'} eq $store && $entry->{'path'} eq $path)
+        {
+          push (@dropped_entries, $entry);
+        }
+        else
+        {
+          push (@new_entries, $entry);
+        }
+      }
+      $drop{$id_str}= \@dropped_entries;
+
+      if (@new_entries)
+      {
+        $r->{'entries'}= \@new_entries;
+
+        my $j= encode_json ($r);
+        # print "generated json: [$j]\n";
+        open (J, '>:utf8', $fnm); print J $j; close (J);
+      }
+      else
+      {
+        # print "nothing left to be saved; deleting file [$fnm]\n";
+        unlink ($fnm);
+      }
+    }
+    return \%drop;
+  }
+  elsif ($be eq 'MongoDB')
+  {
+    die ("implement MongoDB remove");
+  }
+}
+
 =head1 sequence number

 =head2 $reg->next_seq()
@@ -531,7 +699,7 @@ sub ta_match
    {
      next REG unless ($reg->{$k} eq $search->{$k});
    }
-    print "found match: ", main::Dumper ($reg);
+    # print "found match: ", main::Dumper ($reg);
    return ($reg, $i);
  }
  return (undef, 0);

--- a/textarchive/vlib001.pl
+++ b/textarchive/vlib001.pl
@@ -32,6 +32,7 @@ $Data::Dumper::Indent= 1;
 use TA::ObjReg;
 # use TA::Hasher;
 # use TA::Util;
+use md5cat;

 my @PAR= ();
 my $project;
@@ -41,6 +42,14 @@ my $DEBUG= 0;
 my $STOP= 0;
 my $op_mode= 'refresh';

+my @hdr= qw(md5 path mtime fs_size ino);
+
+# --- 8< --- [from chkmd5.pl] ---
+# my $Dir_Pattern= '[0-9_a-zA-Z]*';
+my $Dir_Pattern= '.';
+my $DEFAULT_file_list= "find $Dir_Pattern -xdev -type f -print|";
+# --- >8 ---
+
 while (my $arg= shift (@ARGV))
 {
     if ($arg eq '--') { push (@PAR, @ARGV); @ARGV= (); }
@@ -96,15 +105,11 @@ if ($op_mode eq 'refresh')
  }
  print "store_cfg: ", Dumper ($store_cfg) if ($DEBUG);

-  if ($catalog->{'format'} eq 'md5cat')
-  {
-    refresh_md5cat ($objreg, $store);
-  }
+     if ($catalog->{'format'} eq 'md5cat')   { refresh_md5cat   ($objreg, $store); }
+  elsif ($catalog->{'format'} eq 'internal') { refresh_internal ($objreg, $store); }
 }
 elsif ($op_mode eq 'verify')
 {
-  my @hdr= qw(path_count path mtime fs_size ino);
-
  $objreg->verify_toc (\&verify_toc_item, \@hdr);
 }
 elsif ($op_mode eq 'lookup')
@@ -158,6 +163,108 @@ sub refresh_md5cat
  printf ("%6d files processed; %6d files updated\n", $cnt_processed, $cnt_updated);
 }

+sub refresh_internal
+{
+  my $objreg= shift;
+  my $store= shift;
+  my %extra= @_;
+
+  my $cnt_processed= 0;
+  my $cnt_updated= 0;
+  my $cnt_dropped= 0;
+
+  $objreg->verify_toc (\&verify_toc_item, \@hdr);
+  my $toc= $objreg->load_single_toc ($store);
+  # print "toc: ", Dumper ($toc);
+
+  my $md5cat= new md5cat ();
+  $md5cat->read_flist ($DEFAULT_file_list);
+  # print "md5cat: ", Dumper ($md5cat);
+
+  # compare TOC and reference filelist
+  my $fl= $md5cat->{'FLIST'};
+  my %key= ();
+  foreach my $x (@$toc)
+  {
+# print __LINE__, " k=[$k]\n";
+    my $k= $x->{'key'};
+    my $p= $x->{'path'};
+    $key{$k}->{$p}= 0;
+
+    if (exists ($fl->{$p}))
+    {
+      $cnt_processed++;
+      my $f= $fl->{$p};
+      my $matches= 1;
+      AN: foreach my $an (qw(mtime size ino))
+      {
+        unless ($f->{$an} eq $x->{$an})
+        {
+          # print "mismatch in [$an]! x: ", Dumper ($x); print "f: ", Dumper ($f);
+          $matches= 0;
+          last AN;
+        }
+      }
+
+# print "matches: $p $matches\n";
+      if ($matches)
+      {
+        $f->{'state'}= 'nocheck';
+        $f->{'md5'}= $x->{'md5'};
+      }
+    }
+    else
+    {
+      # print "file missing: ", Dumper ($x);
+      $cnt_dropped++;
+    }
+  }
+  # my %paths= map { my $x= $toc->{$_}; $x->{'found'}= 0; $x->{'path'} => $x } keys %$toc;
+  # print "paths: ", Dumper (\%paths);
+  # print "fl: ", Dumper ($fl);
+
+  my $new_files= $md5cat->check_new_files ();
+  # print "new_files: ", Dumper ($new_files);
+  $md5cat->integrate_md5_sums ($new_files);
+  # $md5cat->save_catalog (); # TODO: if save_catalog flag is true!
+
+# ZZZ
+  # update the Object registry with new items
+  foreach my $nf (@$new_files)
+  {
+    my ($md5, $path, $size, $mtime)= @$nf;
+    # print "md5=[$md5] size=[$size] path=[$path]\n";
+
+    $cnt_processed++;
+    my @upd= process_file ($md5, $path, $size);
+    $cnt_updated++ if (@upd);
+  }
+
+  # get filelist again after reintegration to find keys which are no longer in the catalog
+  $fl= $md5cat->{'FLIST'};
+  # print __LINE__, " fl: ", Dumper ($fl);
+  foreach my $p (keys %$fl)
+  {
+    $key{$fl->{$p}->{'md5'}}->{$p}= 1;
+  }
+  # print __LINE__, " key: ", Dumper (\%key);
+
+  my @drop= ();
+  foreach my $k (keys %key)
+  {
+    my $x1= $key{$k};
+    foreach my $p (keys %$x1)
+    {
+      push (@drop, [$k, $p]) if ($x1->{$p} == 0);
+    }
+  }
+  # print __LINE__, " drop: ", Dumper (\@drop);
+
+  $objreg->remove_from_store ($store, \@drop);
+
+  printf ("files: %6d processed; %6d updated; %6d (%d) dropped\n", $cnt_processed, $cnt_updated, $cnt_dropped, scalar (@drop));
+}
+
 sub process_file
 {
  my ($md5, $path, $size)= @_;
@@ -179,7 +286,7 @@ sub process_file

    my $search= { 'md5' => $md5, 'store' => $store, 'path' => $path };
    my $reg= $objreg->lookup ($search);
-    print __LINE__, " reg: ", Dumper ($reg);
+    # print __LINE__, " reg: ", Dumper ($reg);

    my @upd;
    my $ydata;   # pointer to file catalog data within main datastructure
@@ -220,7 +327,7 @@ sub process_file

  if (@upd)
  {
-    print "saving (", join ('|', @upd), ")\n";
+    # print "saving (", join ('|', @upd), ")\n";
    # print __LINE__, " reg: ", Dumper ($reg);
    $objreg->save ($search, $reg);
  }
@@ -235,15 +342,15 @@ sub verify_toc_item
  my $jj= shift;    # this is just the part refering to the store currently processed
  my $ster= shift;  # TOC item to be updated

-  my @paths= keys %{$jj->{'path'}};
-  $ster->{'path_count'}= scalar @paths;
-  my $p1= shift (@paths);
-  my $px1= $jj->{'path'}->{$p1};
+# print __LINE__, " verify_toc_item: j=", Dumper ($j);
+print __LINE__, " verify_toc_item: jj=", Dumper ($jj);
+  # my @paths= keys %{$jj->{'path'}};
+  # $ster->{'path_count'}= scalar @paths;  ... we don't see that this way anymore

-  $ster->{'path'}= $p1;
-  $ster->{'mtime'}= $px1->{'mtime'};
-  $ster->{'fs_size'}= $px1->{'fs_size'};
-  $ster->{'ino'}= $px1->{'ino'};
+  foreach my $k (qw(md5 path mtime fs_size ino))
+  {
+    $ster->{$k}= $jj->{$k};
+  }
 }

 __END__