Skip to content
Snippets Groups Projects
Commit 3059a218 authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

initial version of eod catalog checker

parent 37d62ee1
No related branches found
No related tags found
No related merge requests found
eod1.pl 0 → 100755
#!/usr/bin/perl
=head1 NAME
eod1.pl
=head1 DESCRIPTION
=cut
use strict;
use Data::Dumper;
$Data::Dumper::Indent= 1;
$Data::Dumper::Sortkeys= 1;
use FileHandle;
use utf8;
binmode( STDIN, ':utf8' );
binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
binmode( STDERR, ':utf8' ); autoflush STDERR 1;
use Util::ts;
use Util::JSON;
use MongoDB;
use Util::MongoDB;
use lib 'lib';
use Alma::MARC_Extractor;
use IRMA::NA;
my $agent_config_file= '/etc/irma/eodagent.json';
my @PARS;
my $arg;
while (defined ($arg= shift (@ARGV)))
{
if ($arg eq '-') { push (@PARS, '-'); }
elsif ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); }
elsif ($arg =~ /^--(.+)/)
{
my ($opt, $val)= split ('=', $1, 2);
if ($opt eq 'help') { usage(); }
else { usage(); }
}
elsif ($arg =~ /^-(.+)/)
{
foreach my $opt (split ('', $1))
{
if ($opt eq 'h') { usage(); exit (0); }
# elsif ($opt eq 'x') { $x_flag= 1; }
else { usage(); }
}
}
else
{
push (@PARS, $arg);
}
}
print join (' ', __FILE__, __LINE__, 'caller=['. caller() . ']'), "\n";
my $agent_cnf= Util::JSON::read_json_file ($agent_config_file);
# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
# Step 1: get list of books from inventory
# get handles for various databases
my $inv_db= IRMA::db::get_any_db($agent_cnf, 'inventory_database');
my $foxml_col= $inv_db->get_collection('foxml.data');
# print __LINE__, " foxml_col: ", Dumper($foxml_col);
print __LINE__, " foxml_col=[$foxml_col]\n";
my @foxml_columns= qw(ownerId state model ac_number aleph_url pid);
# my %foxml_columns= map { $_ => 1 } @foxml_columns;
my $search= { ownerId => 'ondemae7', state => 'Active', model => 'Book' };
my %counters;
my $cur= $foxml_col->find( $search );
# print __LINE__, ' cur: ', Dumper($cur);
my %ac_numbers= ();
my %pids= ();
my @books= ();
while (my $rec= $cur->next())
{
# print __LINE__, " rec: ", Dumper($rec);
my %book= map { $_ => $rec->{$_} } @foxml_columns;
# print __LINE__, " book: ", Dumper(\%book);
$book{phaidra_url}= 'https://phaidra.univie.ac.at/'. $book{pid};
push (@{$ac_numbers{$book{ac_number}}} => \%book);
$pids{$book{pid}}= \%book;
push (@books, \%book);
}
print __LINE__, " checking for duplicate ac_numbers\n";
foreach my $ac_number (keys %ac_numbers)
{
my $x= $ac_numbers{$ac_number};
# print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
if (@$x != 1)
{ # either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead
print __LINE__, " duplicate_ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
$counters{duplicate_ac_number}++;
# TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so
# for now, just leave them out of further processing.
foreach my $book (@$x)
{
$book->{ac_number_note}= 'dup';
}
}
}
my $book_count= @books;
print __LINE__, " found $book_count books\n";
# BEGIN step 2: check IRMA records
my $irma_db= IRMA::db::get_any_db($agent_cnf, 'irma_database');
my $irma_col= $irma_db->get_collection('irma.map');
# BEGIN step 3: check Alma records
my $marc_db= IRMA::db::get_any_db($agent_cnf, 'marc_database');
my $marc_col= $marc_db->get_collection('alma.marc');
my $req_col= $marc_db->get_collection('requests');
my @marc_mex_fields= qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc);
my @marc_fields= qw(ac_number mms_id fetched lib_code);
my @marc_extra_fields= qw(marc_record ts_fetched);
my $mex= Alma::MARC_Extractor->new(\@marc_fields);
$mex->{mex_ot2ut}= 0; $mex->{mex_phaidra}= 1;
my @books_duplicate_ac_number;
my @books_problems;
my @books_ok;
BOOK3: foreach my $book (@books)
{
my @irma_notes= ();
my ($ac_number, $pid)= map { $book->{$_} } qw(ac_number pid);
unless (defined ($ac_number) && $ac_number =~ m#^AC\d{8}$#)
{
$book->{problem}= "invalid ac_number=[$ac_number]";
push (@books_problems, $book);
$book->{problem}= 'invalid_ac_number';
$counters{invalid_ac_number}++;
next BOOK3;
}
my @irma_records= $irma_col->find({ ac_number => $ac_number })->all();
print __LINE__, " ac_number=[$ac_number] irma_records: ", scalar @irma_records, ' ', Dumper(\@irma_records);
my $marc= $marc_col->find_one({ ac_number => $ac_number });
# print __LINE__, " marc: ", Dumper($marc);
unless (defined ($marc))
{
$counters{missing_marc_record}++;
$book->{problem}= 'missing_marc_record';
push (@books_problems, $book);
next BOOK3;
}
if ($book->{ac_number_note})
{
$counters{has_ac_number_note}++;
push (@books_duplicate_ac_number, $book); # save for later, see above near the duplicate check
next BOOK3;
}
my @alma_notes= ();
$book->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched});
$mex->extract_identifiers2($marc, $book);
if (exists($book->{mex}))
{
my $mex= $book->{mex};
print __LINE__, " ac_number=[$ac_number] mex: ", Dumper($mex);
foreach my $mf (keys %$mex)
{
my $mfa= $mex->{$mf};
if (@$mfa != 1)
{ # this should be inspected
print __LINE__, " ac_number=[$ac_number] multiple entries for $mf mex: ", Dumper($mex);
$counters{multiple_mex_entries}++;
$book->{problems}= 'multiple_mex_entries';
push (@books_problems, $book);
next BOOK3;
}
foreach my $mfe (@$mfa)
{
my ($df, $val)= @$mfe;
my $copy= 0;
if ($mf eq 'phaidra')
{
if ($val =~ m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
my ($extra, $found_pid)= ($1, $2);
my $correct_url= 'https://phaidra.univie.ac.at/'. $found_pid;
if ($correct_url ne $book->{phaidra_url} || $extra ne '')
{
$book->{update_phaidra_url}= $book->{phaidra_url};
$counters{update_phaidra_url}++;
push(@alma_notes, 'update_phaidra_url');
}
}
}
elsif ($mf eq 'hdl')
{
if ($val =~ m#^11353/10\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
my $found_pid= 'o:'.$1;
if ($found_pid ne $pid)
{ # TODO:
$book->{update_hdl}= $book->{hdl};
$counters{update_hdl}++;
push(@alma_notes, 'update_hdl');
}
}
}
elsif ($mf eq 'doi')
{
if ($val =~ m#^10.25365/digital-copy\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: DOI checken
# if ($found_pid ne $pid)
# { # TODO:
# $book->{update_hdl}= $book->{hdl};
# $counters{update_hdl}++;
# push(@alma_notes, 'update_hdl');
# }
}
}
elsif ($mf eq 'urn')
{
if ($val =~ m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: check if this is the known nbn!
}
}
if ($copy)
{
$book->{'df_'.$mf}= $df;
$book->{'val_'.$mf}= $val;
}
else
{
push(@alma_notes, "junk_${mf}_in_marc");
print __LINE__, " ATTN: ac_number=[$ac_number] pid=[$pid] junk data mf=[$mf] in marc record: df=[$df] val=[$val]\n";
}
}
}
}
else
{
# this is ok, not a bug, Alma simply does not know anything about this phaidra object
print __LINE__, " ac_number=[$ac_number] mex missing\n";
}
unless (exists($book->{df_phaidra}))
{ # TODO: if there is no phaidra_url at all, set it...
$counters{set_phaidra_url}++;
$book->{update_phaidra_url}= $book->{phaidra_url};
push(@alma_notes, 'set_phaidra_url');
}
push (@alma_notes, 'ok') unless (@alma_notes);
$book->{alma_notes}= join(',', @alma_notes);
push (@books_ok, $book);
# print __LINE__, " book: ", Dumper($book);
$counters{ok}++;
}
# END check Alma records
print __LINE__, " counters: ", Dumper(\%counters);
Util::JSON::write_json_file('eod_problems.json', \@books_problems);
my @tsv_columns=
qw( pid alma_notes ownerId state model ac_number ac_number_note aleph_url
marc_record ts_fetched fetched ts_marc mms_id lib_code
df_doi val_doi
df_urn val_urn
df_hdl val_hdl update_hdl
df_phaidra val_phaidra phaidra_url update_phaidra_url
);
write_tsv_file('eod_data.tsv', \@books_ok);
write_tsv_file('duplicate_eod_data.tsv', \@books_duplicate_ac_number);
sub write_tsv_file
{
my $tsv_filename= shift;
my $book_list= shift;
open (TSV, '>:utf8', $tsv_filename) or die "can't write to $tsv_filename";
my $count_books_ok= @$book_list;
print "saving $count_books_ok to $tsv_filename\n";
print TSV join("\t", @tsv_columns), "\n";
BOOKx: foreach my $book (@$book_list)
{
print TSV join("\t", map { $book->{$_} } @tsv_columns), "\n";
}
close (TSV);
}
__END__
=head1 EXAMPLES
=head2 AC02901724
-[ RECORD 1 ]----------------------
pid | o:90496
alma_notes | update_phaidra_url
ownerId | ondemae7
state | Active
model | Book
ac_number | AC02901724
aleph_url | http://aleph.univie.ac.at/F?func=find-c&ccl_term=AC02901724
marc_record | marc_data_found
ts_fetched | 2023-03-18T173800
fetched | 1679161080
ts_marc | 20230303140010.0
mms_id | 990013785100203332
lib_code | 43ACC_UBW
df_phaidra | 856:4: :u
val_phaidra | http://phaidra.univie.ac.at/o:90495
phaidra_url | https://phaidra.univie.ac.at/o:90496
update_phaidra_url | https://phaidra.univie.ac.at/o:90496
o:90495 is a collection, o:90496 is the only member; this is a ZS record,
so this is ok and the suggested change is not correct.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment