From 3ed357a1284669005f13698230155555c7b30cc4 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Tue, 3 Sep 2013 09:41:37 +0200
Subject: [PATCH] added code for toc and seq

---
 textarchive/lib/TA/ObjReg.pm | 259 ++++++++++++++++++++++++++++++++++-
 textarchive/vlib001.pl       |  38 +++--
 2 files changed, 282 insertions(+), 15 deletions(-)

diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm
index 827acd5..cb76270 100644
--- a/textarchive/lib/TA/ObjReg.pm
+++ b/textarchive/lib/TA/ObjReg.pm
@@ -1,9 +1,22 @@
+#
+# File: lib/TA/ObjReg.pm
+#
 
 package TA::ObjReg;
 
+=head1 NAME
+
+  TA::ObjReg  -- Text-Archive Object Registry
+
+=head1 DESCRIPTION
+
+=cut
+
+
 use strict;
 
 use JSON;
+use File::Find;
 
 use TA::Util;
 use TA::Hasher;
@@ -15,7 +28,7 @@ sub new
 
   # check the presence of all required parameters
   my $stopit= 0;
-  foreach my $k (qw(project store))
+  foreach my $k (qw(project))
   {
     unless (exists ($par{$k}))
     {
@@ -39,6 +52,14 @@ sub new
   $obj;
 }
 
+=head1 project level methods
+
+=head2 $reg->get_project()
+
+(re)loads the project related data structures
+
+=cut
+
 sub get_project
 {
   my $obj= shift;
@@ -55,15 +76,50 @@ sub get_project
   }
 
   # print "proj_cfg: ", main::Dumper ($proj_cfg);
-  # TODO: check authorization (no need, if local, but for client-server, we need something1
+  # TODO: check authorization (no need, if local, but for client-server, we need something
 
+  # initialize hasher
   my $base_dir= $obj->{'proj_cfg_dir'};
   $obj->{'proj_cat'}= my $proj_cat= join ('/', $base_dir, 'cat');
   $obj->{'hasher'}= my $hasher= new TA::Hasher ('algorithm' => $proj_cfg->{'algorithm'}, 'pfx' => $proj_cat, 'name' => 'file');
 
+
+  # get sequence number
+  $obj->{'seq_file'}= my $fnm_seq= join ('/', $base_dir, 'sequence.json');
+  $obj->{'seq'}= my $seq= TA::Util::slurp_file ($fnm_seq, 'json');
+  # print "seq: ", main::Dumper ($seq);
+  unless (defined ($seq))
+  {
+    $obj->{'seq'}= $seq= { 'seq' => 0, 'upd' => time () };
+    $obj->_save_seq ();
+  }
+
   $proj_cfg;
 }
 
+=head2 $reg->stores()
+
+returns a list of all stores in the project
+
+=cut
+
+sub stores
+{
+  my $reg= shift;
+
+  my @stores= keys %{$reg->{'cfg'}->{'stores'}};
+
+  (wantarray) ? @stores : \@stores;
+}
+
+=head1 item related methods
+
+=head2 $reg->lookup($key)
+
+returns that keys value, if present, otherwise, undef.
+
+=cut
+
 sub lookup
 {
   my $obj= shift;
@@ -99,11 +155,210 @@ sub save
   my $fnm= $path . '/' . $id_str . '.json';
   # print "description: [$fnm]\n";
 
+  my @st= stat ($fnm);
+  unless (@st)
+  { # TODO: increment sequence and toc
+  }
+
   my $j= encode_json ($new_reg);
   # print "generated json: [$j]\n";
   open (J, '>:utf8', $fnm); print J $j; close (J);
 }
 
+=head1 TOC: Table of Contents
+
+single TOC format:
+   key:
+   {
+     "seq": number, # this items sequence number
+     "upd": epoch   # 
+   }
+
+global TOC format:
+   key:
+   {
+     "seq": number,
+     "stores": [ { store-id: ..., "upd": epoch } ]
+   }
+
+The toc file is stored in:
+
+  <project>/cat/<store-id>.toc.json
+
+=head2 $reg->load_toc ($store)
+
+returns toc hashed by key.
+
+if $store is undef, returns a toc of all stores
+
+=cut
+
+sub load_toc
+{
+  my $reg= shift;
+  my $store= shift;
+  my $cache= shift;
+
+  my $c= $reg->{'proj_cat'};
+  return undef unless (defined ($c)); # not initialized?
+
+  my @stores= (defined ($store)) ? $store : $reg->stores();
+
+  return undef unless (@stores); # return nothing if there is nothing...
+
+  my $toc= {};
+  foreach my $s (@stores)
+  {
+    my $f= $c . '/' . $s . '.toc.json';
+    my $t= TA::Util::slurp_file ($f, 'json');
+    if ($cache)
+    {
+      $reg->{'tocs'}->{$s}= $t;
+    }
+
+    foreach my $k (keys %$t)
+    {
+      my $r;
+
+      unless (defined ($r= $toc->{$k}))
+      { # not yet present in the toc
+        $toc->{$k}= $r= { 'sequence' => $k->{'sequence'} };
+      }
+
+      push (@{$r->{'stores'}}, { 'store' => $s, 'upd' => $k->{'upd'} });
+    }
+  }
+
+  $toc;
+}
+
+sub verify_toc
+{
+  my $reg= shift;
+
+  # my $store= shift; this does not make sense, we need to verify verything anyway
+
+  # my @stores= (defined ($store)) ? $store : $reg->stores();
+  my @stores= $reg->stores();
+  # print "stores: ", join (', ', @stores), "\n"; exit;
+  my %stores;
+
+  my $c= $reg->{'proj_cat'};
+  # pick up current tocs to see if the sequence needs to be updated
+  foreach my $s (@stores)
+  {
+    my $f= $c . '/' . $s . '.toc.json';
+    my $t= TA::Util::slurp_file ($f, 'json');
+    $t= {} unless (defined ($t)); # we need an empty toc if there is none yet
+
+    $stores{$s}= $t;
+  }
+
+  my %items;
+  sub item_files
+  {
+    next if ($_ =~ /\.toc\.json$/);
+    my $x;
+    next unless ($_ =~ /\.json$/ && -f ($x= $File::Find::name));
+
+    # print "file=[$_] path=[$x]\n";
+    $items{$_}= [ $x ];
+  }
+
+  my $d= $reg->{'proj_cat'};
+  print "proj_cat=[$d]\n";
+  find (\&item_files, $d);
+
+  # print "items: ", main::Dumper (\%items);
+  foreach my $item (keys %items)
+  {
+    my $p= $items{$item};
+    my $j= TA::Util::slurp_file ($p->[0], 'json');
+    # print "j: ", main::Dumper ($j);
+    my @i_stores= keys %{$j->{'store'}};
+    my $key= $j->{'key'};
+    print join (' ', $key, @i_stores), "\n";
+
+    # search for a key's sequence number in all known stores, not only
+    # in those that are *currently* used for this store
+    my $seq;
+    S1: foreach my $store (@stores)
+    {
+      if (exists ($stores{$store}->{$key}))
+      {
+        $seq= $stores{$store}->{$key}->{'seq'};
+        last S1;
+      }
+    }
+
+    S2: foreach my $store (@i_stores)
+    {
+      my $ster; # store's toc entry record ;)
+      unless (defined ($ster= $stores{$store}->{$key}))
+      {
+        $ster= $stores{$store}->{$key}=
+        {
+          'seq' => $reg->next_seq(),
+          'upd' =>  time (),
+        };
+      }
+      $ster->{'found'}= 1;
+    }
+  }
+
+  print "finishing\n";
+  # save all tocs now
+  foreach my $s (@stores)
+  {
+    my $f= $c . '/' . $s . '.toc.json';
+    print "saving toc to [$f]\n";
+    unless (open (TOC, '>:utf8', $f))
+    {
+      print STDERR "cant save toc file '$f'\n";
+      next;
+    }
+    print TOC encode_json ($stores{$s}), "\n";
+    close (TOC);
+  }
+
+  # TODO: return something meaningful
+}
+
+=head1 sequence number
+
+=head2 $reg->next_seq()
+
+=cut
+
+sub flush
+{
+  my $reg= shift;
+
+  $reg->_save_seq ();
+}
+
+sub _save_seq
+{
+  my $reg= shift;
+
+  my $f= $reg->{'seq_file'};
+  open (F_SEQ, '>:utf8', $f) or die "cant write sequence to '$f'";
+  print F_SEQ encode_json ($reg->{'seq'}), "\n";
+  close (F_SEQ);
+}
+
+sub next_seq
+{
+  my $reg= shift;
+
+  my $seq= $reg->{'seq'};
+  $seq->{'seq'}++;
+  $seq->{'upd'}= time ();
+  $reg->_save_seq (); # TODO: optionally delay that until $reg->flush();
+
+  $seq->{'seq'};
+}
+
 # =head1 INTERNAL FUNCTIONS
 
 1;
diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl
index 0e54aba..5f121ff 100755
--- a/textarchive/vlib001.pl
+++ b/textarchive/vlib001.pl
@@ -33,9 +33,11 @@ use TA::ObjReg;
 
 my @PAR= ();
 my $project;
-my $store= '<none>';
+my $store;
 my $refresh_fileinfo= 0;
 my $DEBUG= 0;
+my $STOP= 0;
+my $op_mode= 'refresh';
 
 while (my $arg= shift (@ARGV))
 {
@@ -45,6 +47,7 @@ while (my $arg= shift (@ARGV))
        if ($arg eq '--project')  { $project= shift (@ARGV); }
     elsif ($arg eq '--store')    { $store= shift (@ARGV); }
     elsif ($arg eq '--fileinfo') { $refresh_fileinfo= 1; }
+    elsif ($arg =~ /^--(refresh|verify)$/) { $op_mode= $1; }
   }
   elsif ($arg =~ /^-/)
   {
@@ -54,6 +57,7 @@ while (my $arg= shift (@ARGV))
          if ($a eq 'p') { $project= shift (@ARGV); }
       elsif ($a eq 's') { $store= shift (@ARGV); }
       elsif ($a eq 'D') { $DEBUG++; }
+      elsif ($a eq 'X') { $STOP= 1; }
     }
   }
   else { push (@PAR, $arg); }
@@ -67,8 +71,11 @@ print "debug level: $DEBUG\n";
 
 my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store);
 &usage ('no config found') unless (defined ($objreg));
-print "objreg: ", Dumper ($objreg) if ($DEBUG);
+print "objreg: ", Dumper ($objreg) if ($DEBUG || $STOP);
+exit if ($STOP);
 
+if ($op_mode eq 'refresh')
+{
 my $catalog= $objreg->{'cfg'}->{'catalog'};
 &usage ('no catalog found in config') unless (defined ($catalog));
 
@@ -81,15 +88,28 @@ unless (defined ($store_cfg))
 }
 print "store_cfg: ", Dumper ($store_cfg) if ($DEBUG);
 
-if ($catalog->{'format'} eq 'md5cat')
+  if ($catalog->{'format'} eq 'md5cat')
+  {
+    refresh_md5cat ($objreg, $store);
+  }
+}
+elsif ($op_mode eq 'verify')
 {
-  refresh_md5cat ($objreg, $store);
+  $objreg->verify_toc ($store);
 }
 
 # print "objreg: (after refresh)", Dumper ($objreg);
 
 exit (0);
 
+sub usage
+{
+  my $msg= shift;
+  print $msg, "\n";
+  system ("perldoc $0");
+  exit -1;
+}
+
 sub refresh_md5cat
 {
   my $objreg= shift;
@@ -165,7 +185,7 @@ sub process_file
     }
     else
     {
-      $reg= { 'md5' => $md5, 'store' => { $store => { 'path' => { $path => $ydata= $xdata } } } };
+      $reg= { 'key' => $md5, 'store' => { $store => { 'path' => { $path => $ydata= $xdata } } } };
       push (@upd, 'new md5');
     }
 
@@ -193,14 +213,6 @@ sub process_file
   (wantarray) ? @upd : \@upd;
 }
 
-sub usage
-{
-  my $msg= shift;
-  print $msg, "\n";
-  system ("perldoc $0");
-  exit -1;
-}
-
 __END__
 
 =head1 TODO
-- 
GitLab