From 10355f7c743fe235578b9463bf8b4685db762d54 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Sat, 19 Oct 2013 04:31:07 +0200 Subject: [PATCH] restructured data model to be compatible with MongoDB which was also added as a possible backend for storage. --- textarchive/lib/TA/ObjReg.pm | 216 +++++++++++++++++++++++++++++------ textarchive/vlib001.pl | 35 +++--- 2 files changed, 196 insertions(+), 55 deletions(-) diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm index 9811e0e..114cd92 100644 --- a/textarchive/lib/TA/ObjReg.pm +++ b/textarchive/lib/TA/ObjReg.pm @@ -18,7 +18,8 @@ use JSON; use File::Find; use TA::Util; -use TA::Hasher; + +my %plugins_loaded= (); sub new { @@ -77,13 +78,35 @@ sub get_project # print "proj_cfg: ", main::Dumper ($proj_cfg); # TODO: check authorization (no need, if local, but for client-server, we need something - # initialize hasher - my $base_dir= $obj->{'proj_cfg_dir'}; - $obj->{'proj_cat'}= my $proj_cat= join ('/', $base_dir, 'cat'); - $obj->{'hasher'}= my $hasher= new TA::Hasher ('algorithm' => $proj_cfg->{'algorithm'}, 'pfx' => $proj_cat, 'name' => 'file'); + my $be= $proj_cfg->{'backend'}; + unless (exists ($plugins_loaded{$be})) + { + if ($be eq 'TA::Hasher') { require TA::Hasher; } + elsif ($be eq 'TA::UrxnBla') { require TA::UrxnBla; } + elsif ($be eq 'MongoDB') { require MongoDB; } + else + { + print "ATTN: unknown backend '$be'\n"; + return undef; + } + $plugins_loaded{$be}= 1; + } + + if ($be eq 'TA::Hasher') + { + # initialize hasher + my $ta= $proj_cfg->{'TA::Hasher'}; + $ta->{'name'}= 'file'; + $ta->{'pfx'}= $obj->{'proj_cat'}= my $proj_cat= join ('/', $proj_cfg_dir, 'cat'); + $obj->{'hasher'}= my $hasher= new TA::Hasher (%$ta); + } + elsif ($be eq 'MongoDB') + { + $obj->connect_MongoDB ($proj_cfg); + } # get sequence number - $obj->{'seq_file'}= my $fnm_seq= join ('/', $base_dir, 'sequence.json'); + $obj->{'seq_file'}= my $fnm_seq= join ('/', $proj_cfg_dir, 'sequence.json'); $obj->{'seq'}= my $seq= TA::Util::slurp_file ($fnm_seq, 'json'); # print "seq: ", main::Dumper ($seq); unless (defined ($seq)) @@ -121,20 +144,26 @@ returns that keys value, if present, otherwise, undef. sub lookup { my $obj= shift; - my $id_str= shift; - - # print "lookup [$id_str]\n"; - my @r= $obj->{'hasher'}->check_file ($id_str, 0); - # print "id_str=[$id_str] r=", main::Dumper (\@r); - my ($rc, $path)= @r; - - my $fnm= $path . '/' . $id_str . '.json'; - # print "description: [$fnm]\n"; + my $search= shift; - my @st= stat ($fnm); - return undef unless (@st); + my $be= $obj->{'cfg'}->{'backend'}; + print "lookup [$search] be=[$be]\n"; + print main::Dumper ($search); - my $reg= TA::Util::slurp_file ($fnm, 'json'); + my $reg; + if ($be eq 'TA::Hasher') + { + my $id_str= $search->{$obj->{'key'}}; + my ($all_reg, $fnm)= $obj->ta_retrieve ($id_str, 0); + print "fnm=[$fnm] all_reg: ", main::Dumper ($all_reg); + return undef unless (defined ($all_reg)); + ($reg)= ta_match ($all_reg, $search); + } + elsif ($be eq 'MongoDB') + { + $reg= $obj->{'_col'}->find_one ( $search ); + } + # print "reg: ", main::Dumper ($reg); return $reg; } @@ -142,25 +171,54 @@ sub lookup sub save { my $obj= shift; - my $id_str= shift; + my $search= shift; my $new_reg= shift; - print "save [$id_str]\n"; - my @r= $obj->{'hasher'}->check_file ($id_str, 1); - # print "id_str=[$id_str] r=", main::Dumper (\@r); - my ($rc, $path)= @r; + my $be= $obj->{'cfg'}->{'backend'}; + print "save [$new_reg] be=[$be]\n"; + print main::Dumper ($new_reg); + if ($be eq 'TA::Hasher') + { + my $id_str= $search->{$obj->{'key'}}; + my ($all_reg, $fnm)= $obj->ta_retrieve ($id_str, 1); - my $fnm= $path . '/' . $id_str . '.json'; - # print "description: [$fnm]\n"; +=begin comment - my @st= stat ($fnm); - unless (@st) - { # TODO: increment sequence and toc - } + my @st= stat ($fnm); + unless (@st) + { # TODO: increment sequence and toc + } + +=end comment +=cut - my $j= encode_json ($new_reg); - # print "generated json: [$j]\n"; - open (J, '>:utf8', $fnm); print J $j; close (J); + if (defined ($all_reg)) + { + my ($reg, $idx)= ta_match ($all_reg, $search); + if (defined ($reg)) + { + $all_reg->{'entries'}->[$idx]= $new_reg; + } + else + { + push (@{$all_reg->{'entries'}}, $new_reg); + } + } + else + { + $all_reg= { 'key' => $id_str, 'entries' => [ $new_reg ] } + } + + my $j= encode_json ($all_reg); + print "fnm=[$fnm]\n"; + print "generated json: [$j]\n"; + open (J, '>:utf8', $fnm); print J $j; close (J); + } + elsif ($be eq 'MongoDB') + { + print "new_reg: ", main::Dumper ($new_reg); + $obj->{'_col'}->insert ($new_reg); + } } =head1 TOC: Table of Contents @@ -385,7 +443,99 @@ sub next_seq $seq->{'seq'}; } -# =head1 INTERNAL FUNCTIONS +=head1 INTERNAL METHODS + +=head2 $mongo_collection= $obj->connect_MongoDB ($config); + +Connect to MongoDB with connection parameters in hash_ref $config and +returns the MongoDB collection object. + +$config needs the following attribues: host, db, user, pass, collection + +=cut + +sub connect_MongoDB +{ + my $obj= shift; + my $cfg= shift; + + my $cmm= $cfg->{'MongoDB'}; + print "cmm: ", main::Dumper ($cmm); + + my $col; + eval + { + my $connection= MongoDB::Connection->new(host => $cmm->{'host'}); + $connection->authenticate($cmm->{'db'}, $cmm->{'user'}, $cmm->{'pass'}); + my $db= $connection->get_database($cmm->{'db'}); + $col= $db->get_collection($cmm->{'collection'}); + print "col: [$col]\n"; + }; + if ($@) + { + print "ATTN: can't connect to MongoDB ", (join ('/', map { $cmm->{$_} } qw(host user collection))), "\n"; + return undef; + } + + return $obj->{'_col'}= $col; +} + +=head2 ($data, $fnm)= $objreg->ta_retrieve ($key, $create) + +Retrieve and return data referenced by $key and returns path name of +that file. If $create is true, the path leading to that file is created, +when it is not already present. + +=cut + +sub ta_retrieve +{ + my $obj= shift; + my $id_str= shift; + my $create= shift; + + my @r= $obj->{'hasher'}->check_file ($id_str, $create); + # print "id_str=[$id_str] r=", main::Dumper (\@r); + my ($rc, $path)= @r; + + my $fnm= $path . '/' . $id_str . '.json'; + # print "description: [$fnm]\n"; + + my @st= stat ($fnm); + return (undef, $fnm) unless (@st); + + my $all_reg= TA::Util::slurp_file ($fnm, 'json'); + + return ($all_reg, $fnm); +} + +=head1 INTERNAL FUNCTIONS + +=head2 ($entry, $index)= ta_match ($data, $search) + +Select first $entry from $data that matches hash ref $search. + +=cut + +sub ta_match +{ + my $all_reg= shift; + my $search= shift; + + my @k= keys $search; + my @e= @{$all_reg->{'entries'}}; + REG: for (my $i= 0; $i <= $#e; $i++) + { + my $reg= $e[$i]; + foreach my $k (@k) + { + next REG unless ($reg->{$k} eq $search->{$k}); + } + print "found match: ", main::Dumper ($reg); + return ($reg, $i); + } + return (undef, 0); +} 1; __END__ diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl index 7705021..b71b348 100755 --- a/textarchive/vlib001.pl +++ b/textarchive/vlib001.pl @@ -77,7 +77,7 @@ unless (defined ($project)) } # &usage ('no store specified') unless (defined ($store)); -my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store); +my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store, 'key' => 'md5'); &usage ('no config found') unless (defined ($objreg)); print "objreg: ", Dumper ($objreg) if ($DEBUG || $STOP); exit if ($STOP); @@ -171,48 +171,39 @@ sub process_file my $xdata= { + # 'key' => $md5, 'key_type' => 'md5', + 'store' => $store, 'c_size' => $size, 'path' => $path, 'md5' => $md5, 'mtime' => $st[9], 'fs_size' => $st[7], 'ino' => $st[1] }; - my $reg= $objreg->lookup ($md5); + my $search= { 'md5' => $md5, 'store' => $store, 'path' => $path }; + my $reg= $objreg->lookup ($search); + print __LINE__, " reg: ", Dumper ($reg); my @upd; my $ydata; # pointer to file catalog data within main datastructure if (defined ($reg)) { # we know something about this key value but not in respect to the repository at hand # print "json read: ", main::Dumper ($reg); - my $sb; - if (defined ($sb= $reg->{'store'}->{$store}) - && exists ($sb->{'path'}) - && defined ($ydata= $sb->{'path'}->{$path}) # we need to keep track of the path as well otherwise we can't handly duplicates in the same store - && $st[7] == $ydata->{'fs_size'} - && $st[9] == $ydata->{'mtime'} - ) - { # compare stored and current information and update if necessary foreach my $an (keys %$xdata) { - unless ($ydata->{$an} eq $xdata->{$an}) + unless ($reg->{$an} eq $xdata->{$an}) { - $ydata->{$an}= $xdata->{$an}; + $reg->{$an}= $xdata->{$an}; push (@upd, $an); } } - } - else - { - $reg->{'store'}->{$store}->{'path'}->{$path}= $ydata= $xdata; - push (@upd, 'store upd'); - } } else { # this key is new, so we simply place what we know in the newly created registry item - $reg= { 'key' => $md5, 'key_type' => 'md5', 'store' => { $store => { 'path' => { $path => $ydata= $xdata } } } }; + # $reg= { 'key' => $md5, 'key_type' => 'md5', 'store' => { $store => $ydata= $xdata } }; + $reg= $xdata; push (@upd, 'new key'); } # fill in some more information about that file - if (!exists ($ydata->{'fileinfo'}) || $refresh_fileinfo) + if (!exists ($reg->{'fileinfo'}) || $refresh_fileinfo) { my $xpath= $path; $xpath=~ s#'#'\\''#g; @@ -220,7 +211,7 @@ sub process_file chop ($res); my ($xpath, $fileinfo)= split (/: */, $res, 2); - $ydata->{'fileinfo'}= $fileinfo; + $reg->{'fileinfo'}= $fileinfo; push (@upd, 'fileinfo updated'); } @@ -231,7 +222,7 @@ sub process_file { print "saving (", join ('|', @upd), ")\n"; # print __LINE__, " reg: ", Dumper ($reg); - $objreg->save ($md5, $reg); + $objreg->save ($search, $reg); } (wantarray) ? @upd : \@upd; -- GitLab