Skip to content
Snippets Groups Projects
Commit 4d63f0ff authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

factored code out int Perl modele Univie::EoD::CrossReference

parent e566b09e
No related branches found
No related tags found
No related merge requests found
......@@ -29,16 +29,29 @@ use Util::MongoDB;
use lib 'lib';
use Alma::MARC_Extractor;
use IRMA::NA;
# use IRMA::NA;
use Univie::EoD::CrossReference;
my $agent_config_file= '/etc/irma/eodagent.json';
my $op_mode= 'complete';
my @PARS;
my @tsv_columns=
qw( pid verdict ownerId state model
ac_number alma_notes ac_number_note aleph_url
marc_record ts_fetched fetched ts_marc mms_id lib_code
ticket ticket_status vt
df_doi val_doi doi update_doi
df_urn val_urn urn update_urn
df_hdl val_hdl hdl update_hdl
df_phaidra val_phaidra phaidra_url update_phaidra_url
);
my (@pars, @ac_numbers, @pids);
my $arg;
while (defined ($arg= shift (@ARGV)))
{
if ($arg eq '-') { push (@PARS, '-'); }
elsif ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); }
if ($arg eq '-') { push (@pars, '-'); }
elsif ($arg eq '--') { push (@pars, @ARGV); @ARGV= (); }
elsif ($arg =~ /^--(.+)/)
{
my ($opt, $val)= split ('=', $1, 2);
......@@ -56,7 +69,9 @@ while (defined ($arg= shift (@ARGV)))
}
else
{
push (@PARS, $arg);
if ($arg =~ m#^[\w\d\-]+:\d+$#) { push (@pids, $arg) }
elsif ($arg =~ m#^AC\d{8}$#) { push (@ac_numbers, $arg) }
else { push (@pars, $arg); }
}
}
......@@ -65,236 +80,87 @@ print join (' ', __FILE__, __LINE__, 'caller=['. caller() . ']'), "\n";
my $agent_cnf= Util::JSON::read_json_file ($agent_config_file);
# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
# Step 1: get list of books from inventory
# get handles for various databases
my $inv_db= IRMA::db::get_any_db($agent_cnf, 'inventory_database');
my $foxml_col= $inv_db->get_collection('foxml.data');
# print __LINE__, " foxml_col: ", Dumper($foxml_col);
print __LINE__, " foxml_col=[$foxml_col]\n";
my %counters;
my @books_duplicate_ac_number;
my @books_problems;
my @books_ok;
my @foxml_columns= qw(ownerId state model ac_number aleph_url pid);
# my %foxml_columns= map { $_ => 1 } @foxml_columns;
my $search= { ownerId => 'ondemae7', state => 'Active', model => 'Book' };
my $crf= Univie::EoD::CrossReference->new( agent_cnf => $agent_cnf, counters => \%counters );
my %counters;
my $cur= $foxml_col->find( $search );
# print __LINE__, ' cur: ', Dumper($cur);
my %ac_numbers= ();
my %pids= ();
my @books= ();
while (my $rec= $cur->next())
my $count_objects= 0;
if (@ac_numbers)
{
# print __LINE__, " rec: ", Dumper($rec);
my %book= map { $_ => $rec->{$_} } @foxml_columns;
# print __LINE__, " book: ", Dumper(\%book);
$book{phaidra_url}= 'https://phaidra.univie.ac.at/'. $book{pid};
push (@{$ac_numbers{$book{ac_number}}} => \%book);
$pids{$book{pid}}= \%book;
$crf->get_book_by_ac_number(\@ac_numbers);
$count_objects += @ac_numbers;
}
push (@books, \%book);
if (@pids)
{
$crf->get_book_by_pid(\@pids);
$count_objects += @pids;
}
print __LINE__, " checking for duplicate ac_numbers\n";
foreach my $ac_number (keys %ac_numbers)
if (@pars)
{
my $x= $ac_numbers{$ac_number};
# print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
if (@$x != 1)
{ # either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead
print __LINE__, " duplicate_ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
$counters{duplicate_ac_number}++;
# TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so
# for now, just leave them out of further processing.
foreach my $book (@$x)
{
$book->{ac_number_note}= 'dup';
}
while (my $par= shift (@pars))
{
# TODO: ???
}
}
my $book_count= @books;
print __LINE__, " found $book_count books\n";
if ($count_objects == 0)
{
$crf->get_book_inventory();
}
# BEGIN step 2: check IRMA records
my $irma_db= IRMA::db::get_any_db($agent_cnf, 'irma_database');
my $irma_col= $irma_db->get_collection('irma.map');
$crf->flag_duplicate_ac_numbers();
process_book_list($crf->get_book_list());
# BEGIN step 3: check Alma records
my $marc_db= IRMA::db::get_any_db($agent_cnf, 'marc_database');
my $marc_col= $marc_db->get_collection('alma.marc');
my $req_col= $marc_db->get_collection('requests');
print __LINE__, " counters: ", Dumper(\%counters);
my @marc_mex_fields= qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc);
my @marc_fields= qw(ac_number mms_id fetched lib_code);
my @marc_extra_fields= qw(marc_record ts_fetched);
my $mex= Alma::MARC_Extractor->new(\@marc_fields);
$mex->{mex_ot2ut}= 0; $mex->{mex_phaidra}= 1;
Util::JSON::write_json_file('eod_problems.json', \@books_problems);
my @books_duplicate_ac_number;
my @books_problems;
my @books_ok;
BOOK3: foreach my $book (@books)
write_tsv_file('eod_data.tsv', \@books_ok);
write_tsv_file('duplicate_eod_data.tsv', \@books_duplicate_ac_number);
write_tsv_file('problems_eod_data.tsv', \@books_problems);
exit(0);
sub process_book_list
{
my @irma_notes= ();
my ($ac_number, $pid)= map { $book->{$_} } qw(ac_number pid);
my $book_list= shift;
unless (defined ($ac_number) && $ac_number =~ m#^AC\d{8}$#)
{
$book->{problem}= "invalid ac_number=[$ac_number]";
push (@books_problems, $book);
$book->{problem}= 'invalid_ac_number';
$counters{invalid_ac_number}++;
next BOOK3;
}
my $book_count= @$book_list;
print __LINE__, " found $book_count books\n";
my @irma_records= $irma_col->find({ ac_number => $ac_number })->all();
print __LINE__, " ac_number=[$ac_number] irma_records: ", scalar @irma_records, ' ', Dumper(\@irma_records);
# BEGIN step 2: check IRMA records
my $marc= $marc_col->find_one({ ac_number => $ac_number });
# print __LINE__, " marc: ", Dumper($marc);
unless (defined ($marc))
{
$counters{missing_marc_record}++;
$book->{problem}= 'missing_marc_record';
push (@books_problems, $book);
next BOOK3;
}
# BEGIN step 3: check Alma records
# my $req_col= $marc_db->get_collection('requests');
if ($book->{ac_number_note})
{
$counters{has_ac_number_note}++;
push (@books_duplicate_ac_number, $book); # save for later, see above near the duplicate check
next BOOK3;
}
# my @marc_mex_fields= qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc);
# my @marc_extra_fields= qw(marc_record ts_fetched);
my @alma_notes= ();
$book->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched});
$mex->extract_identifiers2($marc, $book);
if (exists($book->{mex}))
BOOK3: foreach my $book (@$book_list)
{
my $mex= $book->{mex};
print __LINE__, " ac_number=[$ac_number] mex: ", Dumper($mex);
foreach my $mf (keys %$mex)
my ($verdict)= $crf->check_book($book);
$book->{verdict}= $verdict;
$counters{$verdict}++;
if ($verdict eq 'ok')
{
my $mfa= $mex->{$mf};
if (@$mfa != 1)
{ # this should be inspected
print __LINE__, " ac_number=[$ac_number] multiple entries for $mf mex: ", Dumper($mex);
$counters{multiple_mex_entries}++;
$book->{problems}= 'multiple_mex_entries';
push (@books_problems, $book);
next BOOK3;
}
foreach my $mfe (@$mfa)
{
my ($df, $val)= @$mfe;
my $copy= 0;
if ($mf eq 'phaidra')
{
if ($val =~ m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
my ($extra, $found_pid)= ($1, $2);
my $correct_url= 'https://phaidra.univie.ac.at/'. $found_pid;
if ($correct_url ne $book->{phaidra_url} || $extra ne '')
{
$book->{update_phaidra_url}= $book->{phaidra_url};
$counters{update_phaidra_url}++;
push(@alma_notes, 'update_phaidra_url');
}
}
}
elsif ($mf eq 'hdl')
{
if ($val =~ m#^11353/10\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
my $found_pid= 'o:'.$1;
if ($found_pid ne $pid)
{ # TODO:
$book->{update_hdl}= $book->{hdl};
$counters{update_hdl}++;
push(@alma_notes, 'update_hdl');
}
}
}
elsif ($mf eq 'doi')
{
if ($val =~ m#^10.25365/digital-copy\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: DOI checken
# if ($found_pid ne $pid)
# { # TODO:
# $book->{update_hdl}= $book->{hdl};
# $counters{update_hdl}++;
# push(@alma_notes, 'update_hdl');
# }
}
}
elsif ($mf eq 'urn')
{
if ($val =~ m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: check if this is the known nbn!
}
}
if ($copy)
{
$book->{'df_'.$mf}= $df;
$book->{'val_'.$mf}= $val;
}
else
{
push(@alma_notes, "junk_${mf}_in_marc");
print __LINE__, " ATTN: ac_number=[$ac_number] pid=[$pid] junk data mf=[$mf] in marc record: df=[$df] val=[$val]\n";
}
}
push (@books_ok, $book);
}
elsif ($verdict eq 'has_ac_number_note')
{
push (@books_duplicate_ac_number, $book);
}
else
{
push (@books_problems, $book);
}
}
else
{
# this is ok, not a bug, Alma simply does not know anything about this phaidra object
print __LINE__, " ac_number=[$ac_number] mex missing\n";
}
unless (exists($book->{df_phaidra}))
{ # TODO: if there is no phaidra_url at all, set it...
$counters{set_phaidra_url}++;
$book->{update_phaidra_url}= $book->{phaidra_url};
push(@alma_notes, 'set_phaidra_url');
}
push (@alma_notes, 'ok') unless (@alma_notes);
$book->{alma_notes}= join(',', @alma_notes);
push (@books_ok, $book);
# print __LINE__, " book: ", Dumper($book);
$counters{ok}++;
}
# END check Alma records
print __LINE__, " counters: ", Dumper(\%counters);
Util::JSON::write_json_file('eod_problems.json', \@books_problems);
my @tsv_columns=
qw( pid alma_notes ownerId state model ac_number ac_number_note aleph_url
marc_record ts_fetched fetched ts_marc mms_id lib_code
df_doi val_doi
df_urn val_urn
df_hdl val_hdl update_hdl
df_phaidra val_phaidra phaidra_url update_phaidra_url
);
write_tsv_file('eod_data.tsv', \@books_ok);
write_tsv_file('duplicate_eod_data.tsv', \@books_duplicate_ac_number);
sub write_tsv_file
{
......@@ -314,7 +180,32 @@ sub write_tsv_file
__END__
=head1 EXAMPLES
=head1 PROBLEMS
=head2 AC06947549
-[ RECORD 1 ]----------------------
pid | o:1024961
verdict | ok
ownerId | ondemae7
state | Active
model | Book
ac_number | AC06947549
alma_notes | update_hdl
aleph_url | https://ubdata.univie.ac.at/AC06947549
marc_record | marc_data_found
ts_fetched | 2023-03-18T172423
fetched | 1679160263
ts_marc | 20230210194745.0
mms_id | 990074078270203332
lib_code | 43ACC_UBW
df_hdl | 776:0:8:o
val_hdl | 11353/10.1027250
df_phaidra | 856:4:1:u
val_phaidra | https://phaidra.univie.ac.at/o:1024961
phaidra_url | https://phaidra.univie.ac.at/o:1024961
the handle value in 776:0:8:o is 11353/10.1027250 but it should be 11353/10.1024961
=head2 AC02901724
......@@ -338,6 +229,7 @@ phaidra_url | https://phaidra.univie.ac.at/o:90496
update_phaidra_url | https://phaidra.univie.ac.at/o:90496
o:90495 is a collection, o:90496 is the only member; this is a ZS record,
so this is ok and the suggested change is not correct.
so this is ok and the suggested change is not correct. Filtered for now
package Univie::EoD::CrossReference;
use strict;
use Data::Dumper;
use IRMA::db;
use Util::ts;
my @foxml_columns= qw(ownerId state model ac_number aleph_url pid);
my @marc_fields= qw(ac_number mms_id fetched lib_code);
sub new
{
my $class= shift;
my $self=
{
ac_numbers => {},
pids => {},
books => [],
};
bless ($self, $class);
$self->set(@_);
$self;
}
sub set
{
my $self= shift;
my %par= @_;
foreach my $par (keys %par)
{
$self->{$par}= $par{$par};
}
$self;
}
sub get_book_list
{
my $self= shift;
$self->{books};
}
sub get_db_col
{
my $self= shift;
my $db= shift;
my $col_name= shift;
return $self->{_db}->{$db}->{$col_name} if (defined ($self->{_db}->{$db}->{$col_name}));
my $inv_db= IRMA::db::get_any_db($self->{agent_cnf}, $db);
$self->{_db}->{$db}->{$col_name}= my $col= $inv_db->get_collection($col_name);
print __LINE__, " db=[$db] col_name=[$col_name] col=[$col]\n";
# print __LINE__, " col: ", Dumper($col);
$col;
}
sub get_book_inventory
{
my $self= shift;
my $search= { ownerId => $self->{agent_cnf}->{ownerId}, state => 'Active', model => 'Book' };
$self->find_books($search);
}
sub get_book_by_ac_number
{
my $self= shift;
my $ac_numbers= shift;
my $search= { ownerId => $self->{agent_cnf}->{ownerId}, state => 'Active', model => 'Book' };
_modify_search ($search, 'ac_number', $ac_numbers);
$self->find_books($search);
}
sub get_book_by_pid
{
my $self= shift;
my $pids= shift;
my $search= { ownerId => $self->{agent_cnf}->{ownerId}, state => 'Active', model => 'Book' };
_modify_search ($search, 'pid', $pids);
$self->find_books($search);
}
sub _modify_search
{
my $s= shift;
my $what= shift;
my $val= shift;
if (ref($val) eq 'ARRAY') { $s->{$what} = { '$in' => $val } }
else { $s->{$what}= $val; }
}
sub find_books
{
my $self= shift;
my $search= shift;
print __LINE__, " search: ", join(', ', %$search), "\n";
my $foxml_col= $self->get_db_col('inventory_database', 'foxml.data');
my $cur= $foxml_col->find( $search );
# print __LINE__, ' cur: ', Dumper($cur);
while (my $foxml_rec= $cur->next())
{
# print __LINE__, " foxml_rec=[$foxml_rec]\n";
$self->add_book($foxml_rec);
}
}
sub add_book
{
my $self= shift;
my $foxml_rec= shift;
# print __LINE__, " foxml_rec: ", Dumper($foxml_rec);
my %book= map { $_ => $foxml_rec->{$_} } @foxml_columns;
# print __LINE__, " book: ", Dumper(\%book);
$book{phaidra_url}= 'https://phaidra.univie.ac.at/'. $book{pid};
push (@{$self->{ac_numbers}->{$book{ac_number}}} => \%book);
$self->{pids}->{$book{pid}}= \%book;
push (@{$self->{books}}, \%book);
\%book;
}
sub flag_duplicate_ac_numbers
{
my $self= shift;
my $ac_numbers= $self->{ac_numbers};
print __LINE__, " checking for duplicate ac_numbers\n";
foreach my $ac_number (keys %$ac_numbers)
{
my $x= $ac_numbers->{$ac_number};
# print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
if (@$x != 1)
{ # either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead
print __LINE__, " duplicate_ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
# TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so
# for now, just leave them out of further processing.
foreach my $book (@$x)
{
$book->{ac_number_note}= 'dup';
}
}
}
}
sub check_book
{
my $self= shift;
my $book= shift;
my @irma_notes= ();
my ($ac_number, $pid, $ac_number_note)= map { $book->{$_} } qw(ac_number pid ac_number_note);
unless (defined ($ac_number) && $ac_number =~ m#^AC\d{8}$#)
{
$book->{problem}= "invalid ac_number=[$ac_number]";
$book->{problem}=
return ('invalid_ac_number');
}
return 'ac_number_note' if ($ac_number_note ne '');
return 'filtered_ac_numbers' if (exists ($self->{agent_cnf}->{filtered_ac_numbers}->{$ac_number}));
return 'filtered_pids' if (exists ($self->{agent_cnf}->{filtered_pids}->{$pid}));
# check tickets
my $ticket_col= $self->get_db_col('irma_database', 'eod.tickets');
my @tickets= $ticket_col->find({ ac_number => $ac_number })->all();
print __LINE__, " ac_number=[$ac_number] tickets: ", scalar @tickets, "\n";
# print __LINE__, Dumper(\@tickets);
if (@tickets > 1)
{
$book->{ticket}= join(',', map { $_->{ticket} } @tickets);
return 'multiple_tickets';
}
elsif (@tickets == 1)
{
my $t0= @tickets[0];
$book->{ticket}= $t0->{ticket};
$book->{ticket_status}= $t0->{status};
$book->{vt}= $t0->{custom_fields}->{vt}->[1];
}
# check IRMA for registered identifiers
my $irma_col= $self->get_db_col('irma_database', 'irma.map');
my @irma_records= $irma_col->find({ ac_number => $ac_number })->all();
# my @irma_records= $irma_col->find({ pid => $pid })->all(); # NOTE: there is no field named "pid" in the irma record!
print __LINE__, " ac_number=[$ac_number] irma_records: ", scalar @irma_records, "\n";
# print __LINE__, Dumper(\@irma_records);
if (@irma_records > 1) { return 'multiple_irma_records' }
elsif (@irma_records == 1)
{
my $ir0= $irma_records[0];
foreach my $field (qw(hdl urn doi))
{
$book->{$field}= $ir0->{$field} if (exists ($ir0->{$field}));
}
}
elsif (@irma_records == 0)
{
print __LINE__, " ATTN: ac_number=[$ac_number] no irma_record!\n";
$self->{counters}->{missing_irma_record}++;
}
my $marc_col= $self->get_db_col('marc_database', 'alma.marc');
my $marc= $marc_col->find_one({ ac_number => $ac_number });
# print __LINE__, " marc: ", Dumper($marc);
return 'missing_marc_record' unless (defined ($marc));
my @alma_notes= ();
$book->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched});
my $mex= Alma::MARC_Extractor->new(\@marc_fields);
$mex->{mex_ot2ut}= 0; $mex->{mex_phaidra}= 1;
$mex->extract_identifiers2($marc, $book);
if (exists($book->{mex}))
{
my $mex= $book->{mex};
print __LINE__, " ac_number=[$ac_number] mex: ", Dumper($mex);
foreach my $mf (keys %$mex)
{
my $mfa= $mex->{$mf};
if (@$mfa != 1)
{ # this should be inspected
print __LINE__, " ATTN: ac_number=[$ac_number] multiple mex entries for $mf; mex: ", Dumper($mex);
return 'multiple_mex_entries';
}
foreach my $mfe (@$mfa)
{
my ($df, $val)= @$mfe;
my $copy= 0;
if ($mf eq 'phaidra')
{
if ($val =~ m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
my ($extra, $found_pid)= ($1, $2);
my $correct_url= 'https://phaidra.univie.ac.at/'. $found_pid;
if ($correct_url ne $book->{phaidra_url} || $extra ne '')
{
$book->{update_phaidra_url}= $book->{phaidra_url};
$self->{counters}->{update_phaidra_url}++;
push(@alma_notes, 'update_phaidra_url');
}
}
}
elsif ($mf eq 'hdl')
{
if ($val =~ m#^11353/10\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
if ($val ne $book->{hdl})
{ # TODO:
$book->{update_hdl}= $book->{hdl};
$self->{counters}->{update_hdl}++;
push(@alma_notes, 'update_hdl');
}
}
}
elsif ($mf eq 'doi')
{
if ($val =~ m#^10.25365/digital-copy\.(\d+)$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: DOI checken
if ($val ne $book->{doi})
{ # TODO:
$book->{update_doi}= $book->{doi};
$self->{counters}->{update_doi}++;
push(@alma_notes, 'update_doi');
}
}
else
{
print __LINE__, " ATTN: unknown DOI format in marc record: df=[$df] val=[$val]\n";
$self->{counters}->{bad_doi}++;
}
}
elsif ($mf eq 'urn')
{
if ($val =~ m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#)
{
$copy= 1; # only transcribe those URLs that really look like some
# TODO: check if this is the known nbn!
if ($val ne $book->{urn})
{ # TODO:
$book->{update_urn}= $book->{urn};
$self->{counters}->{update_urn}++;
push(@alma_notes, 'update_urn');
}
}
else
{
print __LINE__, " ATTN: unknown NBN (urn) format in marc record: df=[$df] val=[$val]\n";
$self->{counters}->{bad_urn}++;
}
}
if ($copy)
{
$book->{'df_'.$mf}= $df;
$book->{'val_'.$mf}= $val;
}
else
{
push(@alma_notes, "junk_${mf}_in_marc");
print __LINE__, " ATTN: ac_number=[$ac_number] pid=[$pid] junk data mf=[$mf] in marc record: df=[$df] val=[$val]\n";
}
}
}
}
else
{
# this is ok, not a bug, Alma simply does not know anything about this phaidra object
print __LINE__, " ac_number=[$ac_number] mex missing\n";
}
if (!exists($book->{val_urn}) && exists ($book->{urn}))
{
$book->{update_urn}= $book->{urn};
$self->{counters}->{set_urn}++;
push(@alma_notes, 'set_urn');
}
unless (exists($book->{df_phaidra}))
{ # TODO: if there is no phaidra_url at all, set it...
$self->{counters}->{set_phaidra_url}++;
$book->{update_phaidra_url}= $book->{phaidra_url};
push(@alma_notes, 'set_phaidra_url');
}
push (@alma_notes, 'ok') unless (@alma_notes);
$book->{alma_notes}= join(',', @alma_notes);
return 'ok';
}
=head1 INTERNAL? METHODS
=cut
1;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment