initial version of eod catalog checker

3059a218 · Gerhard Gonter · 37d62ee1 · 3059a218
Commit 3059a218 authored 2 years ago by Gerhard Gonter
--- a/eod1.pl
+++ b/eod1.pl
+#!/usr/bin/perl
+
+=head1 NAME
+
+  eod1.pl
+
+=head1 DESCRIPTION
+
+=cut
+
+use strict;
+
+use Data::Dumper;
+$Data::Dumper::Indent= 1;
+$Data::Dumper::Sortkeys= 1;
+
+use FileHandle;
+
+use utf8;
+binmode( STDIN,  ':utf8' );
+binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
+binmode( STDERR, ':utf8' ); autoflush STDERR 1;
+
+use Util::ts;
+use Util::JSON;
+use MongoDB;
+use Util::MongoDB;
+
+use lib 'lib';
+
+use Alma::MARC_Extractor;
+use IRMA::NA;
+
+my $agent_config_file= '/etc/irma/eodagent.json';
+
+my @PARS;
+my $arg;
+while (defined ($arg= shift (@ARGV)))
+{
+     if ($arg eq '-')  { push (@PARS, '-'); }
+  elsif ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); }
+  elsif ($arg =~ /^--(.+)/)
+  {
+    my ($opt, $val)= split ('=', $1, 2); 
+    if ($opt eq 'help') { usage(); }
+    else { usage(); }
+  }
+  elsif ($arg =~ /^-(.+)/)
+  {
+    foreach my $opt (split ('', $1))
+    {   
+         if ($opt eq 'h') { usage(); exit (0); }
+      # elsif ($opt eq 'x') { $x_flag= 1; }
+      else { usage(); }
+    }   
+  }
+  else
+  {
+    push (@PARS, $arg);
+  }
+}
+
+print join (' ', __FILE__, __LINE__, 'caller=['. caller() . ']'), "\n";
+
+my $agent_cnf= Util::JSON::read_json_file ($agent_config_file);
+# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
+
+# Step 1: get list of books from inventory
+# get handles for various databases
+my $inv_db=  IRMA::db::get_any_db($agent_cnf, 'inventory_database');
+my $foxml_col= $inv_db->get_collection('foxml.data');
+# print __LINE__, " foxml_col: ", Dumper($foxml_col);
+print __LINE__, " foxml_col=[$foxml_col]\n";
+
+my @foxml_columns= qw(ownerId state model ac_number aleph_url pid);
+# my %foxml_columns= map { $_ => 1 } @foxml_columns;
+my $search= { ownerId => 'ondemae7', state => 'Active', model => 'Book' };
+
+my %counters;
+my $cur= $foxml_col->find( $search );
+# print __LINE__, ' cur: ', Dumper($cur);
+my %ac_numbers= ();
+my %pids= ();
+my @books= ();
+while (my $rec= $cur->next())
+{
+  # print __LINE__, " rec: ", Dumper($rec);
+  my %book= map { $_ => $rec->{$_} } @foxml_columns;
+  # print __LINE__, " book: ", Dumper(\%book);
+
+  $book{phaidra_url}= 'https://phaidra.univie.ac.at/'. $book{pid};
+  push (@{$ac_numbers{$book{ac_number}}} => \%book);
+  $pids{$book{pid}}= \%book;
+
+  push (@books, \%book);
+}
+
+print __LINE__, " checking for duplicate ac_numbers\n";
+foreach my $ac_number (keys %ac_numbers)
+{
+  my $x= $ac_numbers{$ac_number};
+  # print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
+  if (@$x != 1)
+  { # either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead
+    print __LINE__, " duplicate_ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
+    $counters{duplicate_ac_number}++;
+
+    # TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so
+    # for now, just leave them out of further processing.
+    foreach my $book (@$x)
+    {
+      $book->{ac_number_note}= 'dup';
+    }
+  }
+}
+
+my $book_count= @books;
+print __LINE__, " found $book_count books\n";
+
+# BEGIN step 2: check IRMA records
+my $irma_db=  IRMA::db::get_any_db($agent_cnf, 'irma_database');
+my $irma_col= $irma_db->get_collection('irma.map');
+
+# BEGIN step 3: check Alma records
+my $marc_db=  IRMA::db::get_any_db($agent_cnf, 'marc_database');
+my $marc_col= $marc_db->get_collection('alma.marc');
+my $req_col= $marc_db->get_collection('requests');
+
+  my @marc_mex_fields= qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc);
+  my @marc_fields= qw(ac_number mms_id fetched lib_code);
+  my @marc_extra_fields= qw(marc_record ts_fetched);
+  my $mex= Alma::MARC_Extractor->new(\@marc_fields);
+  $mex->{mex_ot2ut}= 0; $mex->{mex_phaidra}= 1;
+
+my @books_duplicate_ac_number;
+my @books_problems;
+my @books_ok;
+BOOK3: foreach my $book (@books)
+{
+  my @irma_notes= ();
+  my ($ac_number, $pid)= map { $book->{$_} } qw(ac_number pid);
+
+  unless (defined ($ac_number) && $ac_number =~ m#^AC\d{8}$#)
+  {
+    $book->{problem}= "invalid ac_number=[$ac_number]";
+    push (@books_problems, $book);
+    $book->{problem}= 'invalid_ac_number';
+    $counters{invalid_ac_number}++;
+    next BOOK3;
+  }
+
+  my @irma_records= $irma_col->find({ ac_number => $ac_number })->all();
+  print __LINE__, " ac_number=[$ac_number] irma_records: ", scalar @irma_records, ' ', Dumper(\@irma_records);
+
+  my $marc= $marc_col->find_one({ ac_number => $ac_number });
+  # print __LINE__, " marc: ", Dumper($marc);
+  unless (defined ($marc))
+  {
+    $counters{missing_marc_record}++;
+    $book->{problem}= 'missing_marc_record';
+    push (@books_problems, $book);
+    next BOOK3;
+  }
+
+  if ($book->{ac_number_note})
+  {
+    $counters{has_ac_number_note}++;
+    push (@books_duplicate_ac_number, $book); # save for later, see above near the duplicate check
+    next BOOK3;
+  }
+
+  my @alma_notes= ();
+  $book->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched});
+  $mex->extract_identifiers2($marc, $book);
+  if (exists($book->{mex}))
+  {
+    my $mex= $book->{mex};
+    print __LINE__, " ac_number=[$ac_number] mex: ", Dumper($mex);
+    foreach my $mf (keys %$mex)
+    {
+      my $mfa= $mex->{$mf};
+
+      if (@$mfa != 1)
+      { # this should be inspected
+        print __LINE__, " ac_number=[$ac_number] multiple entries for $mf mex: ", Dumper($mex);
+        $counters{multiple_mex_entries}++;
+        $book->{problems}= 'multiple_mex_entries';
+        push (@books_problems, $book);
+        next BOOK3;
+      }
+
+      foreach my $mfe (@$mfa)
+      {
+        my ($df, $val)= @$mfe;
+        my $copy= 0;
+        if ($mf eq 'phaidra')
+        {
+          if ($val =~ m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#)
+          {
+            $copy= 1; # only transcribe those URLs that really look like some
+            my ($extra, $found_pid)= ($1, $2);
+            my $correct_url= 'https://phaidra.univie.ac.at/'. $found_pid;
+            if ($correct_url ne $book->{phaidra_url} || $extra ne '')
+            {
+              $book->{update_phaidra_url}= $book->{phaidra_url};
+              $counters{update_phaidra_url}++;
+              push(@alma_notes, 'update_phaidra_url');
+					  }
+          }
+        }
+        elsif ($mf eq 'hdl')
+        {
+          if ($val =~ m#^11353/10\.(\d+)$#)
+          {
+            $copy= 1; # only transcribe those URLs that really look like some
+            my $found_pid= 'o:'.$1;
+            if ($found_pid ne $pid)
+            { # TODO:
+              $book->{update_hdl}= $book->{hdl};
+              $counters{update_hdl}++;
+              push(@alma_notes, 'update_hdl');
+            }
+          }
+        }
+        elsif ($mf eq 'doi')
+        {
+          if ($val =~ m#^10.25365/digital-copy\.(\d+)$#)
+          {
+            $copy= 1; # only transcribe those URLs that really look like some
+# TODO: DOI checken
+#           if ($found_pid ne $pid)
+#           { # TODO:
+#             $book->{update_hdl}= $book->{hdl};
+#             $counters{update_hdl}++;
+#             push(@alma_notes, 'update_hdl');
+#           }
+          }
+        }
+        elsif ($mf eq 'urn')
+        {
+          if ($val =~ m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#)
+          {
+            $copy= 1; # only transcribe those URLs that really look like some
+            # TODO: check if this is the known nbn!
+          }
+        }
+
+
+        if ($copy)
+        {
+          $book->{'df_'.$mf}= $df;
+          $book->{'val_'.$mf}= $val;
+        }
+        else
+        {
+          push(@alma_notes, "junk_${mf}_in_marc");
+          print __LINE__, " ATTN: ac_number=[$ac_number] pid=[$pid] junk data mf=[$mf] in marc record: df=[$df] val=[$val]\n";
+        }
+      }
+    }
+  }
+  else
+  {
+    # this is ok, not a bug, Alma simply does not know anything about this phaidra object
+    print __LINE__, " ac_number=[$ac_number] mex missing\n";
+  }
+
+  unless (exists($book->{df_phaidra}))
+  { # TODO: if there is no phaidra_url at all, set it...
+    $counters{set_phaidra_url}++;
+    $book->{update_phaidra_url}= $book->{phaidra_url};
+    push(@alma_notes, 'set_phaidra_url');
+  }
+
+  push (@alma_notes, 'ok') unless (@alma_notes);
+  $book->{alma_notes}= join(',', @alma_notes);
+  push (@books_ok, $book);
+  # print __LINE__, " book: ", Dumper($book);
+  $counters{ok}++;
+}
+# END check Alma records
+
+print __LINE__, " counters: ", Dumper(\%counters);
+
+Util::JSON::write_json_file('eod_problems.json', \@books_problems);
+
+my @tsv_columns=
+qw( pid alma_notes ownerId state model ac_number ac_number_note aleph_url
+    marc_record ts_fetched fetched ts_marc mms_id lib_code
+    df_doi val_doi
+    df_urn val_urn
+    df_hdl val_hdl update_hdl
+    df_phaidra val_phaidra phaidra_url update_phaidra_url
+    );
+
+write_tsv_file('eod_data.tsv', \@books_ok);
+write_tsv_file('duplicate_eod_data.tsv', \@books_duplicate_ac_number);
+
+sub write_tsv_file
+{
+  my $tsv_filename= shift;
+  my $book_list= shift;
+
+  open (TSV, '>:utf8', $tsv_filename) or die "can't write to $tsv_filename";
+  my $count_books_ok= @$book_list;
+  print "saving $count_books_ok to $tsv_filename\n";
+  print TSV join("\t", @tsv_columns), "\n";
+  BOOKx: foreach my $book (@$book_list)
+  {
+    print TSV join("\t", map { $book->{$_} } @tsv_columns), "\n";
+  }
+  close (TSV);
+}
+
+__END__
+
+=head1 EXAMPLES
+
+=head2 AC02901724
+
+-[ RECORD 1 ]----------------------
+pid                | o:90496
+alma_notes         | update_phaidra_url
+ownerId            | ondemae7
+state              | Active
+model              | Book
+ac_number          | AC02901724
+aleph_url          | http://aleph.univie.ac.at/F?func=find-c&ccl_term=AC02901724
+marc_record        | marc_data_found
+ts_fetched         | 2023-03-18T173800
+fetched            | 1679161080
+ts_marc            | 20230303140010.0
+mms_id             | 990013785100203332
+lib_code           | 43ACC_UBW
+df_phaidra         | 856:4: :u
+val_phaidra        | http://phaidra.univie.ac.at/o:90495
+phaidra_url        | https://phaidra.univie.ac.at/o:90496
+update_phaidra_url | https://phaidra.univie.ac.at/o:90496
+
+o:90495 is a collection, o:90496 is the only member; this is a ZS record,
+so this is ok and the suggested change is not correct.
+
+