From 10355f7c743fe235578b9463bf8b4685db762d54 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Sat, 19 Oct 2013 04:31:07 +0200
Subject: [PATCH] restructured data model to be compatible with MongoDB which
 was also added as a possible backend for storage.

---
 textarchive/lib/TA/ObjReg.pm | 216 +++++++++++++++++++++++++++++------
 textarchive/vlib001.pl       |  35 +++---
 2 files changed, 196 insertions(+), 55 deletions(-)

diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm
index 9811e0e..114cd92 100644
--- a/textarchive/lib/TA/ObjReg.pm
+++ b/textarchive/lib/TA/ObjReg.pm
@@ -18,7 +18,8 @@ use JSON;
 use File::Find;
 
 use TA::Util;
-use TA::Hasher;
+
+my %plugins_loaded= ();
 
 sub new
 {
@@ -77,13 +78,35 @@ sub get_project
   # print "proj_cfg: ", main::Dumper ($proj_cfg);
   # TODO: check authorization (no need, if local, but for client-server, we need something
 
-  # initialize hasher
-  my $base_dir= $obj->{'proj_cfg_dir'};
-  $obj->{'proj_cat'}= my $proj_cat= join ('/', $base_dir, 'cat');
-  $obj->{'hasher'}= my $hasher= new TA::Hasher ('algorithm' => $proj_cfg->{'algorithm'}, 'pfx' => $proj_cat, 'name' => 'file');
+  my $be= $proj_cfg->{'backend'};
+  unless (exists ($plugins_loaded{$be}))
+  {
+       if ($be eq 'TA::Hasher')  { require TA::Hasher; }
+    elsif ($be eq 'TA::UrxnBla') { require TA::UrxnBla; }
+    elsif ($be eq 'MongoDB')     { require MongoDB; }
+    else
+    {
+      print "ATTN: unknown backend '$be'\n";
+      return undef;
+    }
+    $plugins_loaded{$be}= 1;
+  }
+
+  if ($be eq 'TA::Hasher')
+  {
+    # initialize hasher
+    my $ta= $proj_cfg->{'TA::Hasher'};
+    $ta->{'name'}= 'file';
+    $ta->{'pfx'}= $obj->{'proj_cat'}= my $proj_cat= join ('/', $proj_cfg_dir, 'cat');
+    $obj->{'hasher'}= my $hasher= new TA::Hasher (%$ta);
+  }
+  elsif ($be eq 'MongoDB')
+  {
+    $obj->connect_MongoDB ($proj_cfg);
+  }
 
   # get sequence number
-  $obj->{'seq_file'}= my $fnm_seq= join ('/', $base_dir, 'sequence.json');
+  $obj->{'seq_file'}= my $fnm_seq= join ('/', $proj_cfg_dir, 'sequence.json');
   $obj->{'seq'}= my $seq= TA::Util::slurp_file ($fnm_seq, 'json');
   # print "seq: ", main::Dumper ($seq);
   unless (defined ($seq))
@@ -121,20 +144,26 @@ returns that keys value, if present, otherwise, undef.
 sub lookup
 {
   my $obj= shift;
-  my $id_str= shift;
-
-  # print "lookup [$id_str]\n";
-  my @r= $obj->{'hasher'}->check_file ($id_str, 0);
-  # print "id_str=[$id_str] r=", main::Dumper (\@r);
-  my ($rc, $path)= @r;
-
-  my $fnm= $path . '/' . $id_str . '.json';
-  # print "description: [$fnm]\n";
+  my $search= shift;
 
-  my @st= stat ($fnm);
-  return undef unless (@st);
+  my $be= $obj->{'cfg'}->{'backend'};
+  print "lookup [$search] be=[$be]\n";
+  print main::Dumper ($search);
 
-  my $reg= TA::Util::slurp_file ($fnm, 'json');
+  my $reg;
+  if ($be eq 'TA::Hasher')
+  {
+    my $id_str= $search->{$obj->{'key'}};
+    my ($all_reg, $fnm)= $obj->ta_retrieve ($id_str, 0);
+    print "fnm=[$fnm] all_reg: ", main::Dumper ($all_reg);
+    return undef unless (defined ($all_reg));
+    ($reg)= ta_match ($all_reg, $search);
+  }
+  elsif ($be eq 'MongoDB')
+  {
+    $reg= $obj->{'_col'}->find_one ( $search );
+  }
+  # print "reg: ", main::Dumper ($reg);
  
   return $reg;
 }
@@ -142,25 +171,54 @@ sub lookup
 sub save
 {
   my $obj= shift;
-  my $id_str= shift;
+  my $search= shift;
   my $new_reg= shift;
 
-  print "save [$id_str]\n";
-  my @r= $obj->{'hasher'}->check_file ($id_str, 1);
-  # print "id_str=[$id_str] r=", main::Dumper (\@r);
-  my ($rc, $path)= @r;
+  my $be= $obj->{'cfg'}->{'backend'};
+  print "save [$new_reg] be=[$be]\n";
+  print main::Dumper ($new_reg);
+  if ($be eq 'TA::Hasher')
+  {
+    my $id_str= $search->{$obj->{'key'}};
+    my ($all_reg, $fnm)= $obj->ta_retrieve ($id_str, 1);
 
-  my $fnm= $path . '/' . $id_str . '.json';
-  # print "description: [$fnm]\n";
+=begin comment
 
-  my @st= stat ($fnm);
-  unless (@st)
-  { # TODO: increment sequence and toc
-  }
+    my @st= stat ($fnm);
+    unless (@st)
+    { # TODO: increment sequence and toc
+    }
+
+=end comment
+=cut
 
-  my $j= encode_json ($new_reg);
-  # print "generated json: [$j]\n";
-  open (J, '>:utf8', $fnm); print J $j; close (J);
+    if (defined ($all_reg))
+    {
+      my ($reg, $idx)= ta_match ($all_reg, $search);
+      if (defined ($reg))
+      {
+        $all_reg->{'entries'}->[$idx]= $new_reg;
+      }
+      else
+      {
+        push (@{$all_reg->{'entries'}}, $new_reg);
+      }
+    }
+    else
+    {
+      $all_reg= { 'key' => $id_str, 'entries' => [ $new_reg ] }
+    }
+
+    my $j= encode_json ($all_reg);
+    print "fnm=[$fnm]\n";
+    print "generated json: [$j]\n";
+    open (J, '>:utf8', $fnm); print J $j; close (J);
+  }
+  elsif ($be eq 'MongoDB')
+  {
+    print "new_reg: ", main::Dumper ($new_reg);
+    $obj->{'_col'}->insert ($new_reg);
+  }
 }
 
 =head1 TOC: Table of Contents
@@ -385,7 +443,99 @@ sub next_seq
   $seq->{'seq'};
 }
 
-# =head1 INTERNAL FUNCTIONS
+=head1 INTERNAL METHODS
+
+=head2 $mongo_collection= $obj->connect_MongoDB ($config);
+
+Connect to MongoDB with connection parameters in hash_ref $config and
+returns the MongoDB collection object.
+
+$config needs the following attribues: host, db, user, pass, collection
+
+=cut
+
+sub connect_MongoDB
+{
+  my $obj= shift;
+  my $cfg= shift;
+
+  my $cmm= $cfg->{'MongoDB'};
+  print "cmm: ", main::Dumper ($cmm);
+
+  my $col;
+  eval
+  {
+    my $connection= MongoDB::Connection->new(host => $cmm->{'host'});
+    $connection->authenticate($cmm->{'db'}, $cmm->{'user'}, $cmm->{'pass'});
+    my $db= $connection->get_database($cmm->{'db'});
+    $col= $db->get_collection($cmm->{'collection'});
+    print "col: [$col]\n";
+  };
+  if ($@)
+  {
+    print "ATTN: can't connect to MongoDB ", (join ('/', map { $cmm->{$_} } qw(host user collection))), "\n";
+    return undef;
+  }
+
+  return $obj->{'_col'}= $col;
+}
+
+=head2 ($data, $fnm)= $objreg->ta_retrieve ($key, $create)
+
+Retrieve and return data referenced by $key and returns path name of
+that file.  If $create is true, the path leading to that file is created,
+when it is not already present.
+
+=cut
+
+sub ta_retrieve
+{
+  my $obj= shift;
+  my $id_str= shift;
+  my $create= shift;
+
+    my @r= $obj->{'hasher'}->check_file ($id_str, $create);
+    # print "id_str=[$id_str] r=", main::Dumper (\@r);
+    my ($rc, $path)= @r;
+
+    my $fnm= $path . '/' . $id_str . '.json';
+    # print "description: [$fnm]\n";
+
+    my @st= stat ($fnm);
+    return (undef, $fnm) unless (@st);
+
+    my $all_reg= TA::Util::slurp_file ($fnm, 'json');
+
+  return ($all_reg, $fnm);
+}
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 ($entry, $index)= ta_match ($data, $search)
+
+Select first $entry from $data that matches hash ref $search.
+
+=cut
+
+sub ta_match
+{
+  my $all_reg= shift;
+  my $search= shift;
+
+  my @k= keys $search;
+  my @e= @{$all_reg->{'entries'}};
+  REG: for (my $i= 0; $i <= $#e; $i++)
+  {
+    my $reg= $e[$i];
+    foreach my $k (@k)
+    {
+      next REG unless ($reg->{$k} eq $search->{$k});
+    }
+    print "found match: ", main::Dumper ($reg);
+    return ($reg, $i);
+  }
+  return (undef, 0);
+}
 
 1;
 __END__
diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl
index 7705021..b71b348 100755
--- a/textarchive/vlib001.pl
+++ b/textarchive/vlib001.pl
@@ -77,7 +77,7 @@ unless (defined ($project))
 }
 # &usage ('no store specified') unless (defined ($store));
 
-my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store);
+my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store, 'key' => 'md5');
 &usage ('no config found') unless (defined ($objreg));
 print "objreg: ", Dumper ($objreg) if ($DEBUG || $STOP);
 exit if ($STOP);
@@ -171,48 +171,39 @@ sub process_file
 
     my $xdata=
     {
+      # 'key' => $md5, 'key_type' => 'md5',
+      'store' => $store,
       'c_size' => $size, 'path' => $path, 'md5' => $md5,
       'mtime' => $st[9], 'fs_size' => $st[7], 'ino' => $st[1]
     };
 
-    my $reg= $objreg->lookup ($md5);
+    my $search= { 'md5' => $md5, 'store' => $store, 'path' => $path };
+    my $reg= $objreg->lookup ($search);
+    print __LINE__, " reg: ", Dumper ($reg);
 
     my @upd;
     my $ydata;   # pointer to file catalog data within main datastructure
     if (defined ($reg))
     { # we know something about this key value but not in respect to the repository at hand
       # print "json read: ", main::Dumper ($reg);
-      my $sb;
-      if (defined ($sb= $reg->{'store'}->{$store})
-          && exists ($sb->{'path'})
-          && defined ($ydata= $sb->{'path'}->{$path}) # we need to keep track of the path as well otherwise we can't handly duplicates in the same store
-          && $st[7] == $ydata->{'fs_size'}
-          && $st[9] == $ydata->{'mtime'}
-        )
-      { # compare stored and current information and update if necessary
         foreach my $an (keys %$xdata)
         {
-          unless ($ydata->{$an} eq $xdata->{$an})
+          unless ($reg->{$an} eq $xdata->{$an})
           {
-            $ydata->{$an}= $xdata->{$an};
+            $reg->{$an}= $xdata->{$an};
             push (@upd, $an);
           }
         }
-      }
-      else
-      {
-        $reg->{'store'}->{$store}->{'path'}->{$path}= $ydata= $xdata;
-        push (@upd, 'store upd');
-      }
     }
     else
     { # this key is new, so we simply place what we know in the newly created registry item
-      $reg= { 'key' => $md5, 'key_type' => 'md5', 'store' => { $store => { 'path' => { $path => $ydata= $xdata } } } };
+      # $reg= { 'key' => $md5, 'key_type' => 'md5', 'store' => { $store => $ydata= $xdata } };
+      $reg= $xdata;
       push (@upd, 'new key');
     }
 
     # fill in some more information about that file
-    if (!exists ($ydata->{'fileinfo'}) || $refresh_fileinfo)
+    if (!exists ($reg->{'fileinfo'}) || $refresh_fileinfo)
     {
       my $xpath= $path;
       $xpath=~ s#'#'\\''#g;
@@ -220,7 +211,7 @@ sub process_file
       chop ($res);
 
       my ($xpath, $fileinfo)= split (/: */, $res, 2);
-      $ydata->{'fileinfo'}= $fileinfo;
+      $reg->{'fileinfo'}= $fileinfo;
       push (@upd, 'fileinfo updated');
     }
 
@@ -231,7 +222,7 @@ sub process_file
   {
     print "saving (", join ('|', @upd), ")\n";
     # print __LINE__, " reg: ", Dumper ($reg);
-    $objreg->save ($md5, $reg);
+    $objreg->save ($search, $reg);
   }
 
   (wantarray) ? @upd : \@upd;
-- 
GitLab