diff --git a/textarchive/lib/TA/Hasher.pm b/textarchive/lib/TA/Hasher.pm index ca09cfe3f18ff5c0de4d505ad4c76bdcbad973e7..2025cb03282f2ad7df7983827a3141d3e6599c21 100644 --- a/textarchive/lib/TA/Hasher.pm +++ b/textarchive/lib/TA/Hasher.pm @@ -9,7 +9,7 @@ package TA::Hasher; use Data::Dumper; $Data::Dumper::Indent= 1; -my %known_algorithms= map { $_ => 1 } qw(NULL S3C1L S3C2L P3C3L); +my %known_algorithms= map { $_ => 1 } qw(NULL S3C1L S3C2L P3C3L P3C2L P3C1L); sub new { @@ -43,6 +43,14 @@ sub new { $obj->{'mkpo'}= \&TA::Hasher::P3C3L::mkpo; } + elsif ($algorithm eq 'P3C2L') + { + $obj->{'mkpo'}= \&TA::Hasher::P3C2L::mkpo; + } + elsif ($algorithm eq 'P3C1L') + { + $obj->{'mkpo'}= \&TA::Hasher::P3C1L::mkpo; + } else { $obj->{'mkpo'}= \&TA::Hasher::NULL::mkpo; @@ -173,6 +181,42 @@ sub mkpo return { 'L' => \@L }; } +package TA::Hasher::P3C2L; + +sub mkpo +{ + my $S= shift; + + return undef unless (defined ($S)); + + my @L; + + if ($S =~ m#^(...)(.{1,3})#) + { @L= ( $1, $2 ); } + elsif ($S =~ m#^(.{1,3})#) + { @L= ( $1, 'ZZZ' ); } + else { @L= ( $S, 'ZZZ' ); } + + return { 'L' => \@L }; +} + +package TA::Hasher::P3C1L; + +sub mkpo +{ + my $S= shift; + + return undef unless (defined ($S)); + + my @L; + + if ($S =~ m#^(.{1,3})#) + { @L= ( $1 ); } + else { @L= ( $S ); } + + return { 'L' => \@L }; +} + 1; __END__ diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm index e851ad49e6aa8eb8924732aa168f0844896a5077..827acd5dc9b9eb22cdfcfeac26588619051769de 100644 --- a/textarchive/lib/TA/ObjReg.pm +++ b/textarchive/lib/TA/ObjReg.pm @@ -15,7 +15,7 @@ sub new # check the presence of all required parameters my $stopit= 0; - foreach my $k (qw(project subproject)) + foreach my $k (qw(project store)) { unless (exists ($par{$k})) { @@ -69,16 +69,16 @@ sub lookup my $obj= shift; my $id_str= shift; - print "lookup [$id_str]\n"; + # print "lookup [$id_str]\n"; my @r= $obj->{'hasher'}->check_file ($id_str, 0); # print "id_str=[$id_str] r=", main::Dumper (\@r); my ($rc, $path)= @r; my $fnm= $path . '/' . $id_str . '.json'; - print "description: [$fnm]\n"; + # print "description: [$fnm]\n"; my @st= stat ($fnm); - return undef unless (defined (@st)); + return undef unless (@st); my $reg= TA::Util::slurp_file ($fnm, 'json'); @@ -97,10 +97,10 @@ sub save my ($rc, $path)= @r; my $fnm= $path . '/' . $id_str . '.json'; - print "description: [$fnm]\n"; + # print "description: [$fnm]\n"; my $j= encode_json ($new_reg); - print "generated json: [$j]\n"; + # print "generated json: [$j]\n"; open (J, '>:utf8', $fnm); print J $j; close (J); } diff --git a/textarchive/lib/TA/Util.pm b/textarchive/lib/TA/Util.pm index 820cbb44472e94c20d772126f5be7905f9c1db7c..69e8e4f73126c5e4c5441a30e4e883b38d0bc4c3 100644 --- a/textarchive/lib/TA/Util.pm +++ b/textarchive/lib/TA/Util.pm @@ -16,7 +16,7 @@ sub slurp_file my $fnm= shift; my $format= shift || 'lines'; - open (FI, $fnm) or return undef; + open (FI, '<:utf8', $fnm) or return undef; my @lines= <FI>; close (FI); diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl index 9159da15d4f5092bc3cbe341182fd99d00c00f21..ee132dc8f53041e1fc1b87b9c09501a19919fd08 100755 --- a/textarchive/vlib001.pl +++ b/textarchive/vlib001.pl @@ -11,8 +11,9 @@ use TA::ObjReg; my @PAR= (); my $project; -my $subproject; +my $store= '<none>'; my $refresh_fileinfo= 0; +my $DEBUG= 0; while (my $arg= shift (@ARGV)) { @@ -20,7 +21,7 @@ while (my $arg= shift (@ARGV)) elsif ($arg =~ /^--/) { if ($arg eq '--project') { $project= shift (@ARGV); } - elsif ($arg eq '--subproject') { $subproject= shift (@ARGV); } + elsif ($arg eq '--store') { $store= shift (@ARGV); } elsif ($arg eq '--fileinfo') { $refresh_fileinfo= 1; } } elsif ($arg =~ /^-/) @@ -29,29 +30,37 @@ while (my $arg= shift (@ARGV)) foreach my $a (@a) { if ($a eq 'p') { $project= shift (@ARGV); } - elsif ($a eq 's') { $subproject= shift (@ARGV); } + elsif ($a eq 's') { $store= shift (@ARGV); } + elsif ($a eq 'D') { $DEBUG++; } } } else { push (@PAR, $arg); } } -unless (defined ($project)) -{ - &usage ('no project specified'); -} +print "debug level: $DEBUG\n"; + +&usage ('no project specified') unless (defined ($project)); +# &usage ('no store specified') unless (defined ($store)); -my $objreg= new TA::ObjReg ('project' => $project, 'subproject' => $subproject); -print "objreg: ", Dumper ($objreg); +my $objreg= new TA::ObjReg ('project' => $project, 'store' => $store); +&usage ('no config found') unless (defined ($objreg)); +print "objreg: ", Dumper ($objreg) if ($DEBUG); -my $catalog= $objreg->{'cfg'}->{'catalog'}; -unless (defined ($catalog)) +my $catalog= $objreg->{'cfg'}->{'catalog'}; +&usage ('no catalog found in config') unless (defined ($catalog)); + +my $stores_p= $objreg->{'cfg'}->{'stores'}; +my $store_cfg= $stores_p->{$store}; +unless (defined ($store_cfg)) { - &usage ('no catalog specified'); + print "no store config found for '$store', check these: ", Dumper ($stores_p); + exit (-2); } +print "store_cfg: ", Dumper ($store_cfg) if ($DEBUG); if ($catalog->{'format'} eq 'md5cat') { - refresh_md5cat ($objreg, $subproject); + refresh_md5cat ($objreg, $store); } # print "objreg: (after refresh)", Dumper ($objreg); @@ -61,7 +70,7 @@ exit (0); sub refresh_md5cat { my $objreg= shift; - my $subproject= shift; + my $store= shift; my %extra= @_; # my $catalog= $objreg->{'cfg'}->{'catalog'}; @@ -69,33 +78,49 @@ sub refresh_md5cat # my $hasher= $objreg->{'hasher'}; - open (CAT, '_catalog') or die "cant read catalog"; + open (CAT, '<:utf8', '_catalog') or die "cant read catalog"; + my $cnt_processed= 0; + my $cnt_updated= 0; CAT: while (<CAT>) { chop; my ($md5, $xf, $size, $path)= split (' ', $_, 4); $path=~ s#^\.\/##; - print "md5=[$md5] size=[$size] path=[$path]\n"; - my @st= stat ($path); + # print "md5=[$md5] size=[$size] path=[$path]\n"; + + $cnt_processed++; + my @upd= process_file ($md5, $path, $size); + $cnt_updated++ if (@upd); + } + close (CAT); + printf ("%6d files processed; %6d files updated\n", $cnt_processed, $cnt_updated); +} +sub process_file +{ + my ($md5, $path, $size)= @_; + + my @st= stat ($path); unless (@st) { print STDERR "ATTN: could not stat file '$path'\n"; - next CAT; + return undef; } my $xdata= { 'c_size' => $size, 'path' => $path, 'mtime' => $st[9], 'fs_size' => $st[7] }; - # $objreg->add ($md5, 0, { 'md5' => $md5, $subproject => { 'size' => $size, 'path' => $path, %extra } } ); my $reg= $objreg->lookup ($md5); - print "json read: ", main::Dumper ($reg); - my $upd= 0; + my @upd; my $ydata; # pointer to file catalog data within main datastructure if (defined ($reg)) { # we know something about this hash value but not in respect to the repository at hand - if (defined ($ydata= $reg->{$subproject}) + # print "json read: ", main::Dumper ($reg); + my $sb; + if (defined ($sb= $reg->{'store'}->{$store}) + && exists ($sb->{'path'}) + && defined ($ydata= $sb->{'path'}->{$path}) # we need to keep track of the path as well otherwise we can't handly duplicates in the same store && $st[7] == $ydata->{'fs_size'} && $st[9] == $ydata->{'mtime'} ) @@ -103,33 +128,47 @@ sub refresh_md5cat } else { - $ydata= $reg->{$subproject}= $xdata; - $upd= 1; + # print "st: fs_size(7)=[$st[7]] mtime(9)=[$st[9]]\n"; + # print "ydata: ", Dumper ($ydata); + # print "xdata: ", Dumper ($xdata); + + $reg->{'store'}->{$store}->{'path'}->{$path}= $ydata= $xdata; + # print __LINE__, " reg: ", Dumper ($reg); + # print "ydata: ", Dumper ($ydata); + # print "xdata: ", Dumper ($xdata); + + push (@upd, 'store upd'); } } else { - $reg= { 'md5' => $md5, $subproject => $ydata= $xdata }; - $upd= 1; + $reg= { 'md5' => $md5, 'store' => { $store => { 'path' => { $path => $ydata= $xdata } } } }; + push (@upd, 'new md5'); } # fill in some more information about that file if (!exists ($ydata->{'fileinfo'}) || $refresh_fileinfo) { - my $res= `/usr/bin/file '$path'`; + my $xpath= $path; + $xpath=~ s#'#'\\''#g; + my $res= `/usr/bin/file '$xpath'`; chop ($res); my ($xpath, $fileinfo)= split (/: */, $res, 2); $ydata->{'fileinfo'}= $fileinfo; - $upd= 1; + push (@upd, 'fileinfo updated'); } # TODO: some more information would probably be nice as well # e.g. mp3info or stuff - $objreg->save ($md5, $reg) if ($upd); + if (@upd) + { + print "saving (", join ('|', @upd), ")\n"; + # print __LINE__, " reg: ", Dumper ($reg); + $objreg->save ($md5, $reg); } - close (CAT); -} + (wantarray) ? @upd : \@upd; +} sub usage {