From 3059a218c4a3551b5d80898bff9be1bba3458cf1 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Thu, 23 Mar 2023 18:06:17 +0100 Subject: [PATCH] initial version of eod catalog checker --- eod1.pl | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100755 eod1.pl diff --git a/eod1.pl b/eod1.pl new file mode 100755 index 0000000..179c234 --- /dev/null +++ b/eod1.pl @@ -0,0 +1,343 @@ +#!/usr/bin/perl + +=head1 NAME + + eod1.pl + +=head1 DESCRIPTION + +=cut + +use strict; + +use Data::Dumper; +$Data::Dumper::Indent= 1; +$Data::Dumper::Sortkeys= 1; + +use FileHandle; + +use utf8; +binmode( STDIN, ':utf8' ); +binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; +binmode( STDERR, ':utf8' ); autoflush STDERR 1; + +use Util::ts; +use Util::JSON; +use MongoDB; +use Util::MongoDB; + +use lib 'lib'; + +use Alma::MARC_Extractor; +use IRMA::NA; + +my $agent_config_file= '/etc/irma/eodagent.json'; + +my @PARS; +my $arg; +while (defined ($arg= shift (@ARGV))) +{ + if ($arg eq '-') { push (@PARS, '-'); } + elsif ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); } + elsif ($arg =~ /^--(.+)/) + { + my ($opt, $val)= split ('=', $1, 2); + if ($opt eq 'help') { usage(); } + else { usage(); } + } + elsif ($arg =~ /^-(.+)/) + { + foreach my $opt (split ('', $1)) + { + if ($opt eq 'h') { usage(); exit (0); } + # elsif ($opt eq 'x') { $x_flag= 1; } + else { usage(); } + } + } + else + { + push (@PARS, $arg); + } +} + +print join (' ', __FILE__, __LINE__, 'caller=['. caller() . ']'), "\n"; + +my $agent_cnf= Util::JSON::read_json_file ($agent_config_file); +# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0); + +# Step 1: get list of books from inventory +# get handles for various databases +my $inv_db= IRMA::db::get_any_db($agent_cnf, 'inventory_database'); +my $foxml_col= $inv_db->get_collection('foxml.data'); +# print __LINE__, " foxml_col: ", Dumper($foxml_col); +print __LINE__, " foxml_col=[$foxml_col]\n"; + +my @foxml_columns= qw(ownerId state model ac_number aleph_url pid); +# my %foxml_columns= map { $_ => 1 } @foxml_columns; +my $search= { ownerId => 'ondemae7', state => 'Active', model => 'Book' }; + +my %counters; +my $cur= $foxml_col->find( $search ); +# print __LINE__, ' cur: ', Dumper($cur); +my %ac_numbers= (); +my %pids= (); +my @books= (); +while (my $rec= $cur->next()) +{ + # print __LINE__, " rec: ", Dumper($rec); + my %book= map { $_ => $rec->{$_} } @foxml_columns; + # print __LINE__, " book: ", Dumper(\%book); + + $book{phaidra_url}= 'https://phaidra.univie.ac.at/'. $book{pid}; + push (@{$ac_numbers{$book{ac_number}}} => \%book); + $pids{$book{pid}}= \%book; + + push (@books, \%book); +} + +print __LINE__, " checking for duplicate ac_numbers\n"; +foreach my $ac_number (keys %ac_numbers) +{ + my $x= $ac_numbers{$ac_number}; + # print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n"; + if (@$x != 1) + { # either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead + print __LINE__, " duplicate_ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n"; + $counters{duplicate_ac_number}++; + + # TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so + # for now, just leave them out of further processing. + foreach my $book (@$x) + { + $book->{ac_number_note}= 'dup'; + } + } +} + +my $book_count= @books; +print __LINE__, " found $book_count books\n"; + +# BEGIN step 2: check IRMA records +my $irma_db= IRMA::db::get_any_db($agent_cnf, 'irma_database'); +my $irma_col= $irma_db->get_collection('irma.map'); + +# BEGIN step 3: check Alma records +my $marc_db= IRMA::db::get_any_db($agent_cnf, 'marc_database'); +my $marc_col= $marc_db->get_collection('alma.marc'); +my $req_col= $marc_db->get_collection('requests'); + + my @marc_mex_fields= qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc); + my @marc_fields= qw(ac_number mms_id fetched lib_code); + my @marc_extra_fields= qw(marc_record ts_fetched); + my $mex= Alma::MARC_Extractor->new(\@marc_fields); + $mex->{mex_ot2ut}= 0; $mex->{mex_phaidra}= 1; + +my @books_duplicate_ac_number; +my @books_problems; +my @books_ok; +BOOK3: foreach my $book (@books) +{ + my @irma_notes= (); + my ($ac_number, $pid)= map { $book->{$_} } qw(ac_number pid); + + unless (defined ($ac_number) && $ac_number =~ m#^AC\d{8}$#) + { + $book->{problem}= "invalid ac_number=[$ac_number]"; + push (@books_problems, $book); + $book->{problem}= 'invalid_ac_number'; + $counters{invalid_ac_number}++; + next BOOK3; + } + + my @irma_records= $irma_col->find({ ac_number => $ac_number })->all(); + print __LINE__, " ac_number=[$ac_number] irma_records: ", scalar @irma_records, ' ', Dumper(\@irma_records); + + my $marc= $marc_col->find_one({ ac_number => $ac_number }); + # print __LINE__, " marc: ", Dumper($marc); + unless (defined ($marc)) + { + $counters{missing_marc_record}++; + $book->{problem}= 'missing_marc_record'; + push (@books_problems, $book); + next BOOK3; + } + + if ($book->{ac_number_note}) + { + $counters{has_ac_number_note}++; + push (@books_duplicate_ac_number, $book); # save for later, see above near the duplicate check + next BOOK3; + } + + my @alma_notes= (); + $book->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched}); + $mex->extract_identifiers2($marc, $book); + if (exists($book->{mex})) + { + my $mex= $book->{mex}; + print __LINE__, " ac_number=[$ac_number] mex: ", Dumper($mex); + foreach my $mf (keys %$mex) + { + my $mfa= $mex->{$mf}; + + if (@$mfa != 1) + { # this should be inspected + print __LINE__, " ac_number=[$ac_number] multiple entries for $mf mex: ", Dumper($mex); + $counters{multiple_mex_entries}++; + $book->{problems}= 'multiple_mex_entries'; + push (@books_problems, $book); + next BOOK3; + } + + foreach my $mfe (@$mfa) + { + my ($df, $val)= @$mfe; + my $copy= 0; + if ($mf eq 'phaidra') + { + if ($val =~ m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#) + { + $copy= 1; # only transcribe those URLs that really look like some + my ($extra, $found_pid)= ($1, $2); + my $correct_url= 'https://phaidra.univie.ac.at/'. $found_pid; + if ($correct_url ne $book->{phaidra_url} || $extra ne '') + { + $book->{update_phaidra_url}= $book->{phaidra_url}; + $counters{update_phaidra_url}++; + push(@alma_notes, 'update_phaidra_url'); + } + } + } + elsif ($mf eq 'hdl') + { + if ($val =~ m#^11353/10\.(\d+)$#) + { + $copy= 1; # only transcribe those URLs that really look like some + my $found_pid= 'o:'.$1; + if ($found_pid ne $pid) + { # TODO: + $book->{update_hdl}= $book->{hdl}; + $counters{update_hdl}++; + push(@alma_notes, 'update_hdl'); + } + } + } + elsif ($mf eq 'doi') + { + if ($val =~ m#^10.25365/digital-copy\.(\d+)$#) + { + $copy= 1; # only transcribe those URLs that really look like some +# TODO: DOI checken +# if ($found_pid ne $pid) +# { # TODO: +# $book->{update_hdl}= $book->{hdl}; +# $counters{update_hdl}++; +# push(@alma_notes, 'update_hdl'); +# } + } + } + elsif ($mf eq 'urn') + { + if ($val =~ m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#) + { + $copy= 1; # only transcribe those URLs that really look like some + # TODO: check if this is the known nbn! + } + } + + + if ($copy) + { + $book->{'df_'.$mf}= $df; + $book->{'val_'.$mf}= $val; + } + else + { + push(@alma_notes, "junk_${mf}_in_marc"); + print __LINE__, " ATTN: ac_number=[$ac_number] pid=[$pid] junk data mf=[$mf] in marc record: df=[$df] val=[$val]\n"; + } + } + } + } + else + { + # this is ok, not a bug, Alma simply does not know anything about this phaidra object + print __LINE__, " ac_number=[$ac_number] mex missing\n"; + } + + unless (exists($book->{df_phaidra})) + { # TODO: if there is no phaidra_url at all, set it... + $counters{set_phaidra_url}++; + $book->{update_phaidra_url}= $book->{phaidra_url}; + push(@alma_notes, 'set_phaidra_url'); + } + + push (@alma_notes, 'ok') unless (@alma_notes); + $book->{alma_notes}= join(',', @alma_notes); + push (@books_ok, $book); + # print __LINE__, " book: ", Dumper($book); + $counters{ok}++; +} +# END check Alma records + +print __LINE__, " counters: ", Dumper(\%counters); + +Util::JSON::write_json_file('eod_problems.json', \@books_problems); + +my @tsv_columns= +qw( pid alma_notes ownerId state model ac_number ac_number_note aleph_url + marc_record ts_fetched fetched ts_marc mms_id lib_code + df_doi val_doi + df_urn val_urn + df_hdl val_hdl update_hdl + df_phaidra val_phaidra phaidra_url update_phaidra_url + ); + +write_tsv_file('eod_data.tsv', \@books_ok); +write_tsv_file('duplicate_eod_data.tsv', \@books_duplicate_ac_number); + +sub write_tsv_file +{ + my $tsv_filename= shift; + my $book_list= shift; + + open (TSV, '>:utf8', $tsv_filename) or die "can't write to $tsv_filename"; + my $count_books_ok= @$book_list; + print "saving $count_books_ok to $tsv_filename\n"; + print TSV join("\t", @tsv_columns), "\n"; + BOOKx: foreach my $book (@$book_list) + { + print TSV join("\t", map { $book->{$_} } @tsv_columns), "\n"; + } + close (TSV); +} + +__END__ + +=head1 EXAMPLES + +=head2 AC02901724 + +-[ RECORD 1 ]---------------------- +pid | o:90496 +alma_notes | update_phaidra_url +ownerId | ondemae7 +state | Active +model | Book +ac_number | AC02901724 +aleph_url | http://aleph.univie.ac.at/F?func=find-c&ccl_term=AC02901724 +marc_record | marc_data_found +ts_fetched | 2023-03-18T173800 +fetched | 1679161080 +ts_marc | 20230303140010.0 +mms_id | 990013785100203332 +lib_code | 43ACC_UBW +df_phaidra | 856:4: :u +val_phaidra | http://phaidra.univie.ac.at/o:90495 +phaidra_url | https://phaidra.univie.ac.at/o:90496 +update_phaidra_url | https://phaidra.univie.ac.at/o:90496 + +o:90495 is a collection, o:90496 is the only member; this is a ZS record, +so this is ok and the suggested change is not correct. + + -- GitLab