diff --git a/textarchive/lib/TA/ObjReg.pm b/textarchive/lib/TA/ObjReg.pm index 77f65719e7700f70b85226c06afcbbfec8d82f2f..ea6bf215515fb911b787e546fae33af4b32df5ab 100644 --- a/textarchive/lib/TA/ObjReg.pm +++ b/textarchive/lib/TA/ObjReg.pm @@ -623,7 +623,7 @@ sub remove_from_store my $objreg= shift; my $store= shift; my $drop_list= shift; # array ref containing entries: [ $md5, $path ] - # TODO: maybe a more universial format could be useful + # TODO: maybe a more universal format could be useful my $be= $objreg->{'cfg'}->{'backend'}; if ($be eq 'TA::Hasher') @@ -684,6 +684,185 @@ sub remove_store =end comment =cut +sub check_policy +{ + my $objreg= shift; + + # print __LINE__, " objreg: ", main::Dumper ($objreg); + + my ($be, $policy)= map { $objreg->{'cfg'}->{$_} } qw(backend policy); + unless ($be eq 'MongoDB') + { + print "not implemented for backend [$be]\n"; + return undef; + } + + unless (defined ($policy)) + { + print "no policy defined\n"; + return undef; + } + + print __LINE__, " policy: ", main::Dumper ($policy); + my ($key, $rs_list, $check_list, $ign_keys, $ign_paths)= map { $policy->{$_} } qw(key replica_sets check ignore_key ignore_path_pattern); + my @check_list= @$check_list; + my %ign_keys; %ign_keys= map { $_ => 1 } @$ign_keys if (defined ($ign_keys)); + my $replica_map= get_replica_map ($rs_list); + +# my $MIN_SIZE= 2_000_000_000; +my $MIN_SIZE= 200_000_000; + my $cursor= $objreg->{'_cat'}->find ( { fs_size => { '$gt' => $MIN_SIZE } } ); + + my %items= (); + my $item_count= 0; + ITEM: while (my $item= $cursor->next()) + { + my $kv= $item->{$key}; + next ITEM if (exists ($ign_keys{$kv})); + + my $path= $item->{path}; + foreach my $ign_path (@$ign_paths) + { + next ITEM if ($path =~ m#$ign_path#); + } + + # print __LINE__, " key=[$key] kv=[$kv] item: ", main::Dumper ($item); + + my $rec= $items{$kv}; + if (defined ($rec)) + { + # print "duplicate key...\n"; + my $mismatch= 0; + foreach my $an (@check_list) { $mismatch++ if ($rec->{$an} ne $item->{$an}); } + + if ($mismatch) + { + print "item mismatch on key=[$key]: rec: ", main::Dumper ($rec), "item: ", main::Dumper ($item); + next ITEM; + } + } + else + { + my %rec= map { $_ => $item->{$_} } (@check_list, qw(fileinfo)); + $rec= $items{$kv}= \%rec; + } + + push (@{$rec->{stores}->{$item->{store}}}, $path); + $item_count++; + } + print __LINE__, " item_count: $item_count\n"; + # print __LINE__, " items: ", main::Dumper (\%items); + + foreach my $kv (keys %items) + { + my $item= $items{$kv}; + check_replication_policy ($replica_map, $item); + } +} + +sub get_replica_map +{ + my $rs_list= shift; + print __LINE__, " rs_list: ", main::Dumper ($rs_list); + + my $map= + { + stores => {}, + store_count => {}, + }; + + foreach my $rs (keys %$rs_list) + { + foreach my $store (@{$rs_list->{$rs}}) + { + $map->{stores}->{$store}= $rs; + $map->{store_count}->{$rs}++; + } + } + + print __LINE__, " map: ", main::Dumper ($map); + $map; +} + +sub check_replication_policy +{ + my $map= shift; + my $item= shift; + + # print __LINE__, ' item: ', main::Dumper ($item); + # print __LINE__, ' map: ', main::Dumper ($map); + my @diag; + + my %replica_sets; + my $stores= $item->{stores}; + foreach my $store (keys %$stores) + { + # print __LINE__, " store=[$store]\n"; + my @paths= $stores->{$store}; + if (@paths > 1) + { + push (@diag, [ 'store_duplicate', $store ]); + } + + if (exists ($map->{stores}->{$store})) + { + my $set= $map->{stores}->{$store}; + $replica_sets{$set}++; + } + else + { + push (@diag, [ 'store_not_in_replica_set', $store ]); + } + } + + my $prefered_replica_set; + my @replica_sets= sort keys %replica_sets; + if (@replica_sets == 0) { push (@diag, [ 'no_replica_set' ]); } + elsif (@replica_sets > 1) { push (@diag, [ 'multiple_replica_sets', join (' ', @replica_sets) ]); } + elsif (@replica_sets == 1) + { # NOTE: the object is in one replica set, now check, if it is present in all stores; + + # TODO: it should be possible to have say 10 stores in one replica set + # and specify that an object must be present in at least 3 of them. + # Right now, the map does not contain this info. Maybe the config + # should specify the real # map and not simple lists of replica sets ... + + # TODO: the replication map should also encode geographically + # distributed replicas. E.g. have at least 3 replicas on different + # locations + + $prefered_replica_set= $replica_sets[0]; + + if ($replica_sets{$prefered_replica_set} != $map->{store_count}->{$prefered_replica_set}) + { + my @missing; + my $ms= $map->{stores}; + foreach my $store (sort keys %$ms) + { + push (@missing, $store) if ($ms->{$store} eq $prefered_replica_set && !exists ($stores->{$store})); + } + + push (@diag, [ 'replica_set_incomplete', join (' ', @missing) ]); + } + } + + # NOTE: there are different kings of problems: + # * an object with too few replicas in a given replica set is a problem + # * an object with all replicas but with extra copies is not really a problem, but should be noted + + if (@diag) + { + print "ATTN: replication policy problem; prefered_replica_set=[$prefered_replica_set]\n"; + print join ("\n", map { 'NOTE: ' . join (' ', @$_) } @diag), "\n"; + print __LINE__, ' item: ', main::Dumper ($item); + } + + { + prefered_replica_set => $prefered_replica_set, + diag => \@diag, + } +} + =head1 sequence number =head2 $reg->next_seq() diff --git a/textarchive/vlib001.pl b/textarchive/vlib001.pl index 2be5617ea7a21c461515228f7eafdf345aad58fa..326f185767ecf124c713265c75c3f48737a896c9 100755 --- a/textarchive/vlib001.pl +++ b/textarchive/vlib001.pl @@ -104,7 +104,7 @@ while (my $arg= shift (@ARGV)) elsif ($opt eq 'noinode') { $check_inode= 0; } elsif ($opt eq 'subdir') { push (@subdirs, $val || shift (@ARGV)); } elsif ($opt eq 'cd') { $cd_mode= 1; } - elsif ($arg =~ /^--(refresh|verify|lookup|edit|maint|next-seq|get-cat)$/) { $op_mode= $1; } + elsif ($arg =~ /^--(refresh|verify|lookup|edit|maint|next-seq|get-cat|policy)$/) { $op_mode= $1; } else { &usage ("unknown option '$arg'"); } } elsif ($arg =~ /^-/) @@ -253,6 +253,13 @@ elsif ($op_mode eq 'next-seq') my $x= $objreg->next_seq (); print "x: ", Dumper ($x); } +elsif ($op_mode eq 'policy') +{ + # my $catalog= $objreg->{'cfg'}->{'catalog'}; + # &usage ('no catalog found in config') unless (defined ($catalog)); + + $objreg->check_policy (); +} # print "objreg: (after refresh)", Dumper ($objreg);