From 37d62ee190fbcce26d4ee573a9ee497625484437 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Mon, 26 Sep 2022 14:32:22 +0200 Subject: [PATCH] additional functionality for ut1.pl --- lib/DataCite/API.pm | 6 +- lib/Univie/Utheses/API.pm | 58 ++-- ut1.pl | 620 +++++++++++++++++++++++++++++++++----- 3 files changed, 591 insertions(+), 93 deletions(-) diff --git a/lib/DataCite/API.pm b/lib/DataCite/API.pm index 5601f17..ce759dd 100644 --- a/lib/DataCite/API.pm +++ b/lib/DataCite/API.pm @@ -52,7 +52,7 @@ sub register_doi unless ($code1 =~ m#^20[01]#) { print STDERR "ATTN: register_doi POST metadata returned code1=[$code1] res1=[$res1]\n"; - return undef; + return (0, $res1, undef); } my $doi_reg= <<"EOX"; @@ -66,10 +66,10 @@ EOX unless ($code2 =~ m#^20[01]#) { print STDERR "ATTN: register_doi POST doi returned code2=[$code2] res2=[$res2]\n"; - return undef; + return (0, $res1, $res2); } - 1; + (1, $res1, $res2); } sub datacite_request diff --git a/lib/Univie/Utheses/API.pm b/lib/Univie/Utheses/API.pm index 1b2e896..11f01a1 100644 --- a/lib/Univie/Utheses/API.pm +++ b/lib/Univie/Utheses/API.pm @@ -20,45 +20,65 @@ sub getContainerPublicMetadata my $self= shift; my $utheses_id= shift; - my ($code1, $res1)= $self->utheses_request('GET', 'container/get/public', $utheses_id); - print __LINE__, " code1=[$code1] res1=[$res1]\n"; - - my $info; - if ($code1 =~ m#^2#) - { - eval - { - $info= from_json($res1); - }; - if ($@) - { - die $@; - } - } + my ($code1, $res1, $info)= $self->utheses_request('GET', 'container/get/public', $utheses_id); + print __LINE__, " code1=[$code1] res1=[$res1] info=[$info]\n"; # TODO: Error handling! $info; } +sub getPendingDoisCreateRequest +{ + my $self= shift; + + my ($code1, $res1, $info)= $self->utheses_request('GET', 'doi/get/createRequest', undef, ); + print __LINE__, " code1=[$code1] res1=[$res1] info=[$info]\n"; + + # TODO: Error handling! + (wantarray) ? ($code1, $res1, $info) : $info; +} + sub utheses_request { my $self= shift; my $method= shift; my $what= shift; - my $id= shift; + my $par= shift; - my ($api_url)= map { $self->{config}->{$_} } qw(api_url); + my ($api_url, $headers)= map { $self->{config}->{$_} } qw(api_url headers); - my $req_url= join ('/', $api_url, $what, $id); + my $req_url= join ('/', $api_url, $what, $par); print __LINE__, " req_url=[$req_url]\n"; my $req = HTTP::Request->new( $method => $req_url ); + if (defined ($headers)) + { + foreach my $h (@$headers) + { + $req->header(@$h); + } + } my $ua= LWP::UserAgent->new; my $res= $ua->request($req); # print __LINE__, " res: ", main::Dumper($res); my $txt= decode("utf8", $res->content()); - return ($res->code(), $txt); + my $code= $res->code(); + + my $info; + if ($code =~ m#^2#) + { + eval + { + $info= from_json($txt); + }; + if ($@) + { + die $@; # TODO: this should be handled mor gracefully! + } + } + + return ($code, $txt, $info); } 1; diff --git a/ut1.pl b/ut1.pl index 71d6154..4cf8654 100755 --- a/ut1.pl +++ b/ut1.pl @@ -3,6 +3,7 @@ use strict; use FileHandle; +use utf8; binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; binmode( STDERR, ':utf8' ); autoflush STDERR 1; @@ -11,21 +12,36 @@ binmode( STDIN, ':utf8' ); use Data::Dumper; $Data::Dumper::Indent= 1; +use Digest::MD5::File qw(file_md5_hex); + +use Util::ts; +use Util::JSON; +use Redmine::DB::MySQL; + +use Phaidra::Utils::iso639; + use lib 'lib'; use Univie::Utheses::API; -use Phaidra::Utils::iso639; -use Util::JSON; use DataCite::API; +use IRMA::NA; +use Alma::MARC_Extractor; my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors ); # my $op_mode= 'fetch_metadata_bulk'; -my $op_mode= 'analyze'; +my $op_mode= 'process'; my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp! -my $config_file= '/etc/irma/DataCite.json'; +# TODO: make these configurable too! +my $agent_config_file= '/etc/irma/pidagent.json'; + +my $MAX_MARC_AGE= 86400*60; +my $MAX_MARC_REQUESTS= 10_000; my $do_register_doi= 0; +my $agent_name= 'pidagent'; +my $agent_id= $$; +my $fix_problems= 0; my @pars= (); while (my $arg= shift (@ARGV)) @@ -36,6 +52,7 @@ while (my $arg= shift (@ARGV)) my ($opt, $val)= split ('=', $1, 2); if ($opt eq 'help') { usage(); } elsif ($opt eq 'register-doi') { $do_register_doi= 1; } + elsif ($opt eq 'fix') { $fix_problems= 1; } else { usage(); } } elsif ($arg =~ /^-(.+)/) @@ -53,25 +70,84 @@ while (my $arg= shift (@ARGV)) } } -my $cnf= Util::JSON::read_json_file ($config_file); -# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0); -my $ut_cnf= $cnf->{repositories}->{'utheses.univie.ac.at'}; +# load configuration +my $agent_cnf= Util::JSON::read_json_file ($agent_config_file); +# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0); +my $dc_cnf= Util::JSON::read_json_file ($agent_cnf->{DataCite_config}); +# print __LINE__, " dc_cnf: ", main::Dumper ($dc_cnf); exit(0); +my $ut_cnf= $dc_cnf->{repositories}->{'utheses.univie.ac.at'}; # print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0); -my $reg_cnf= $cnf->{doi_registries}->{$ut_cnf->{registry}}; -# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0); +my $dc_reg_cnf= $dc_cnf->{doi_registries}->{my $dc_cnf_name= $ut_cnf->{registry}}; +# print __LINE__, " dc_reg_cnf: ", main::Dumper ($dc_reg_cnf); exit(0); + +# prepare APIs +my $ut_api= new Univie::Utheses::API( config => $ut_cnf->{api_config} ); +die "no ut_api" unless (defined ($ut_api)); - my $utapi= new Univie::Utheses::API( config => { api_url => $ut_cnf->{api_url} } ); - die "no utapi" unless (defined ($utapi)); +my $reg_obj= new DataCite::API (config => $dc_reg_cnf, xmode => 'test'); +die "no reg_obj" unless (defined ($reg_obj)); - my $reg_obj= new DataCite::API (config => $reg_cnf, xmode => 'test'); - die "no reg_obj" unless (defined ($reg_obj)); +# get handles for various databases +my $marc_db= IRMA::db::get_any_db($agent_cnf, 'marc_database'); +my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database'); +print __LINE__, " agent_db=[$agent_db]\n"; -if ($op_mode eq 'analyze') +my @marc_fields= qw(ac_number mms_id fetched lib_code); +my $mex= Alma::MARC_Extractor->new(\@marc_fields); + +my $running= 0; +if ($op_mode eq 'process') { - foreach my $par (@pars) + do { - if ($par =~ m#^\d+$#) { analyze_utheses_item($par); } - } + while (my $par= shift (@pars)) + { + if ($par =~ m#^(\d+)$# + || $par =~ m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?# + ) + { + my $utheses_id= $1; + my @actions= process_utheses_item($utheses_id); + print __LINE__, " process_utheses_item($utheses_id) ==> [", join(', ', @actions), "]\n"; + } + elsif ($par =~ m#^(\d+)\.\.(\d+)$# || $par =~ m#^(\d+)\-(\d+)$# || $par =~ m#(blk)(\d+)$#) + { + my ($start, $end)= ($1, $2); + ($start, $end)= ($end*100, $end*100+99) if ($start eq 'blk'); + my $item_count= $end-$start; + if ($start <= $end && (($item_count <= 2000 && $do_register_doi) || ($item_count <= 5000 && !$do_register_doi))) + { + foreach (my $utheses_id= $start; $utheses_id <= $end; $utheses_id++) + { + push (@pars, "$utheses_id"); # cast to string! + } + } + } + elsif ($par =~ m#^AC\d{8}$#) + { + my @actions= process_ac_number($par); + print __LINE__, " process_ac_number($par) ==> [", join(', ', @actions), "]\n"; + } + elsif ($par eq 'queue') + { + my @utheses_ids= get_job_from_queue(); + print __LINE__, ' queued utheses_ids: ', join(', ', @utheses_ids), "\n"; + sleep(10); + push (@pars, @utheses_ids, 'cleanup'); + } + elsif ($par eq 'cleanup') + { + cleanup_queue(); + } + elsif ($par eq 'gpdcr') + { + my @utheses_ids= gpdcr(); + # push (@pars, @utheses_ids); + } + + sleep(2) if (@pars); + } + } while($running); } elsif ($op_mode eq 'fetch_metadata_bulk') { @@ -79,55 +155,372 @@ elsif ($op_mode eq 'fetch_metadata_bulk') } exit(0); -sub analyze_utheses_item +sub gpdcr +{ + my ($status, $txt, $info)= $ut_api->getPendingDoisCreateRequest(); + print __LINE__, " gpdcr: info: ", Dumper($info); + + my @utheses_ids= (); + if ($status eq '200') + { + my $p= $info->{pendingDois}; + if (defined ($p) && ref($p) eq 'HASH') + { + push (@utheses_ids, map { $p->{$_}->{utheses_id} } keys %$p); + } + else + { + die "unexpected type of pendingDois"; + } + } + + @utheses_ids; +} + +sub cleanup_queue +{ + my $q_col= $agent_db->get_collection('queue'); + + my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name, agent_id => $agent_id}); + if (defined ($j)) + { + print __LINE__, " finishing old job ", Dumper($j); + $q_col->update({_id => $j->{_id}}, { '$set' => {status => "done", agent_name => $agent_name, agent_id => $agent_id }}); + sleep(10); + } +} + +sub get_job_from_queue +{ + my $q_col= $agent_db->get_collection('queue'); + + my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name}); + if (defined ($j)) + { + print __LINE__, " resuming old job ", Dumper($j); + } + else + { + $j= $q_col->find_one({status => "new"}); + } + return () unless (defined ($j)); + print __LINE__, " found new job ", Dumper($j); + + $q_col->update({_id => $j->{_id}}, { '$set' => {status => "in_progress", agent_name => $agent_name, agent_id => $agent_id }}); + + my @ut_ids; + if (exists($j->{utheses_ids})) { push (@ut_ids, @{$j->{utheses_ids}}); } + if (exists($j->{utheses_id})) { push (@ut_ids, $j->{utheses_id}); } + + return @ut_ids; +} + +sub process_utheses_item { my $utheses_id= shift; - my ($row, $xml)= get_utheses_metadata($utheses_id); + print __LINE__, " process_utheses_item: utheses_id=[$utheses_id] ", '='x50, "\n"; + + my $register_doi_ok= 1; + my $row= {}; + my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id); - print __LINE__, " utheses_id=[$utheses_id] ", '='x50, "\n"; + print __LINE__, " error_code=[$error_code] status=[$status]\n"; print __LINE__, " xml=[$xml]\n"; print __LINE__, " row: ", Dumper($row); - if ($do_register_doi) + my @actions= ('fetched_utheses'); + unless ($status eq '200') + { + push (@actions, 'no_utheses_record'); + return @actions; + } + + unless ($error_code eq 'ok') + { + push (@actions, "error_code=[$error_code]"); + report_problem( { area => 'utheses', problem => 'error_code', utheses_id => $utheses_id, error_code => $error_code } ); + $register_doi_ok= 0; + } + + my @x= analyze_marc_record($row); + push (@actions, @x); + + unless (defined ($agent_db)) + { + return @actions; + } + + my $ut_col= $agent_db->get_collection('utheses'); + + if ($fix_problems) { remove_problem('utheses', $row->{utheses_id}); } + # first: compare existing utheses record (if it exists) with data from utheses + my $ut_data= $ut_col->find_one( { utheses_id => $row->{utheses_id} } ); + if (defined ($ut_data)) + { + my @cmp_fields= qw(doi urn nbn ac_number); + my @problems= (); + foreach my $f (@cmp_fields) + { + if (exists($ut_data->{$f}) && defined($ut_data->{$f}) && $row->{$f} ne $ut_data->{$f}) + { + push (@problems, { problem => 'missmatch', field => $f, recorded => $ut_data->{$f}, found => $row->{$f} }); + } + } + + if (@problems) + { + report_problem( { area => 'utheses', problem => 'utheses_data_missmatch', utheses_id => $utheses_id, problems => \@problems } ); + push(@actions, 'problem report utheses'); + return (@actions); + } + } + + # second: update utheses record in database + my $res_upd= $ut_col->update( { utheses_id => $row->{utheses_id} }, $row, { upsert => 1 } ); + print __LINE__, " res_upd: ", Dumper($res_upd); + + my ($ok, $verdict, $msg1, $msg2); + if ($do_register_doi && $register_doi_ok) { - unless (defined ($row)) + if ($fix_problems) { - print "ATTN: can't register DOI: no utheses data found\n"; - goto END; + remove_problem('datacite', $row->{utheses_id}, 'utheses_id'); + remove_problem('datacite', $row->{doi}); } - unless (defined ($xml)) + my $dc_col= $agent_db->get_collection('datacite'); + + my ($utheses_id, $doi, $xml_md5, $url)= map { $row->{$_} } qw(utheses_id doi xml_md5 persistent_link); + my $dc_record= $dc_col->find_one( { doi => $doi } ); + + # prepare registration data + my %reg_info= + ( + doi => $row->{doi}, + url => $url, + context => 'utheses', + registry => $dc_cnf_name, + utheses_id => $utheses_id, + ac_number => $row->{ac_number}, + xml_md5 => $xml_md5, + ts_epoch => time(), + ts_iso_gmt => Util::ts::ts_ISO_gmt(), + ); + + # check existing datacite registration record, if it exists + if (defined ($dc_record)) { - print "ATTN: can't register DOI: no xml data generated\n"; - goto END; + print __LINE__, " datacite record found: ", Dumper($dc_record); + + # check if record matches with new data + my @cmp_fields= qw(registry context url utheses_id ac_number); + my @problems= (); + foreach my $f (@cmp_fields) + { + if (exists($dc_record->{$f}) && defined($dc_record->{$f})) + { + if ($reg_info{$f} ne $dc_record->{$f}) + { + push (@problems, { problem => 'missmatch', field => $f, recorded => $dc_record->{$f}, found => $reg_info{$f} }); + } + # else: record matches, that's ok + } + else + { + push (@problems, { problem => 'missing', field => $f }); + } + } + + if (@problems) + { + report_problem( { area => 'datacite', problem => 'datacite_data_missmatch', utheses_id => $utheses_id, doi => $doi, problems => \@problems } ); + push(@actions, 'problem report datacite'); + return (@actions); + } + + if ($dc_record->{xml_md5} eq $xml_md5 && $dc_record->{reg_status} == 1) + { + print __LINE__, " datacite metadata unchanged; not updating!\n"; + push (@actions, 'datacite doi metadata unchanged'); + return @actions; + } } - my ($doi, $url, $errors, $ftl)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors fulltext_locked); - if ($ftl) + ($ok, $verdict, $msg1, $msg2)= register_doi_with_DataCite_for_utheses($row, $xml); + print __LINE__, " ok=[$ok] msg1=[$msg1] msg2=[$msg2] verdict=[$verdict]\n"; + push (@actions, (($ok) ? 'datacite doi registration ok' : 'datacite doi registration failed'), $verdict); + + $reg_info{reg_status}= $ok; + $reg_info{reg_verdict}= $verdict, + $reg_info{reg_message1}= $msg1; + $reg_info{reg_message2}= $msg2; + + print __LINE__, " insert into datacite; reg_info: ", Dumper(\%reg_info); + my $res= $dc_col->update( { doi => $doi }, \%reg_info, { upsert => 1 } ); + print __LINE__, " insert res: ", Dumper($res); + + unless ($ok) { - print "ATTN: can't register DOI: fulltext locked\n"; - goto END; + report_problem( { area => 'datacite', problem => 'registration failure', utheses_id => $utheses_id, doi => $doi, reg_info => \%reg_info } ); } + } + + @actions; +} + +sub process_ac_number +{ + my $ac_number= shift; - if (@$errors) + my $row= {}; + my @actions= analyze_marc_record($row, $ac_number); + + if (exists($row->{val_utheses})) + { + my $ut_link= $row->{val_utheses}; + if ($ut_link =~ m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?#) + { + my $utheses_id= $1; + my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id); + } + else { - print "ATTN: can't register DOI $doi due to errors: ", join(', ', @$errors), "\n"; - goto END; + # TODO: report bad link in Alma } + } + + print __LINE__, " process_ac_number: ac_number=[$ac_number] row: ", Dumper($row); +} + +sub analyze_marc_record +{ + my $row= shift; + my $ac_number= shift || $row->{ac_number}; + + return ($row->{marc_record}= 'no_marc_db') unless (defined ($marc_db)); + return ($row->{marc_record}= 'invalid_ac_number') unless ($ac_number =~ m#^AC\d{8}$#); + + my @actions= (); + my $marc_col= $marc_db->get_collection('alma.marc'); + my $marc_rec= $marc_col->find_one({ ac_number => $ac_number }); + print __LINE__, " marc_rec: ", Dumper($marc_rec); + my $request_marc_rec= 0; - if ($reg_obj->register_doi ($doi, $xml, $url)) + if (defined ($marc_rec)) + { + my $marc_fetched= $marc_rec->{fetched}; + + my $best_before= $marc_fetched + $MAX_MARC_AGE; + my $now= time(); + print __LINE__, " marc_fetched=[$marc_fetched] best_before=[$best_before] now=[$now]\n"; + + if ($best_before > $now) { - print "NOTE: register_doi doi=[$doi] url=[$url] OK\n"; + $row->{marc_record}= 'ok'; } else { - print "ATTN: register_doi doi=[$doi] url=[$url] was not ok\n"; + print __LINE__, " marc_record too old\n"; + $row->{marc_record}= 'too_old'; + $request_marc_rec= 1; } + + # check marc record, even when it is too old + $row->{marc}= my $x= {}; + $mex->extract_identifiers($marc_rec, $x); + push (@actions, 'marc_rec_checked'); + } + else + { + $row->{marc_record}= 'not_found'; + $request_marc_rec= 1; + } + + if ($request_marc_rec) + { + my $req_col= $marc_db->get_collection('requests'); + + my $req= + { + agent => 'alma_cat', + status => 'new', + action => 'update_alma_2xml', + ac_number => $ac_number, + requested_by => $agent_name, + }; + $req_col->insert ($req); + + push (@actions, 'marc_rec_requested'); + print __LINE__, " analyze_marc_record: insert requests: ", Dumper($req); + } + + @actions; +} + +sub register_doi_with_DataCite_for_utheses +{ + my $row= shift; + my $xml= shift; + + my ($rejected, $accepted); + my $ok= 0; + + unless (defined ($row)) + { + $rejected= "can't register DOI: no utheses data found"; + goto END; + } + + unless (defined ($xml)) + { + $rejected= "can't register DOI: no xml data generated"; + goto END; + } + + my ($doi, $url, $errors, $pol)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors policies); + my ($ftl, $lks)= map { $pol->{$_} } qw(fulltext_locked lock_status); + if (!defined ($ftl) || $ftl) + { + $rejected= "can't register DOI: fulltext locked ($ftl)"; + goto END; + } + + if (!defined ($lks) || $lks) + { + $rejected= "can't register DOI: lock_status=[$lks]"; + goto END; + } + + if (@$errors) + { + $rejected= "can't register DOI $doi due to errors: ", join(', ', @$errors); + goto END; + } + + my ($reg_res, $reg_msg1, $reg_msg2)= $reg_obj->register_doi ($doi, $xml, $url); + if ($reg_res) + { + $accepted= "register_doi doi=[$doi] url=[$url] OK"; + $ok= 1; + } + else + { + $rejected= "register_doi doi=[$doi] url=[$url] was not ok"; } END: - return; + + if ($rejected) + { + print "REJECTED: ", $rejected, "\n"; + } + else + { + print "ACCEPTED: ", $accepted, "\n"; + } + + return ($ok, ($rejected || $accepted), $reg_msg1, $reg_msg2); } sub fetch_metadata_bulk @@ -142,54 +535,90 @@ sub fetch_metadata_bulk { next if ($utheses_id eq 'utheses_id'); # CSV column name... - my ($row, $xml)= get_utheses_metadata($utheses_id); + my $row; + my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id); if (defined ($row)) { print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n"; - } } } +=head2 my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id) + +return DataCite_XML document for a given utheses_id and fill in $row +with important information + +=cut + sub get_utheses_metadata { + my $row= shift; my $utheses_id= shift; print __LINE__, " utheses_id=[$utheses_id]\n"; - my $info= $utapi->getContainerPublicMetadata($utheses_id); + my $info= $ut_api->getContainerPublicMetadata($utheses_id); print __LINE__, " info: ", Dumper($info); - my ($row, $xml); - - if ($info->{status} eq '200') + my $xml; + my $status= $info->{status}; + my $error_code= 'unknown'; + if ($status eq '200') { - ($row, $xml)= utheses2datacite($info, $utheses_id); + $xml= utheses2datacite($row, $info, $utheses_id); # print __LINE__, " row: ", Dumper($row); if (defined ($row->{doi})) { - $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml'; - # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n"; - - if (defined ($xml) && defined ($row->{xml_fnm}) && !@{$row->{datacite_conversion_errors}}) + if (defined ($xml) && defined (my $xml_fnm= $row->{xml_fnm})) { - open (XML, '>:utf8', $row->{xml_fnm}) or die; # TODO: or do something else... + $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml'; + # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n"; + open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else... print XML $xml; close (XML); + my $md5= file_md5_hex($xml_fnm); + print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n"; + $row->{xml_md5}= $md5; + } + + if ($row->{datacite_conversion_error_count} == 0) + { + $error_code= 'ok'; } + else + { + printf ERRORS ("utheses_id=%d datacite conversion errors\n", $utheses_id); + $error_code= 'datacite_conversion_errors'; + } + } + else + { + printf ERRORS ("utheses_id=%d no doi defined\n", $utheses_id); + $error_code= 'no_doi_defined'; } } else { - printf ERRORS ("utheses_id=%d status=%s\n", $utheses_id, $info->{status}); + printf ERRORS ("utheses_id=%d bad status=%s\n", $utheses_id, $status); + $error_code= 'bad_status'; } - return ($row, $xml); + $row->{error_code}= $error_code; + return ($error_code, $status, $xml); } +=head2 my $xml= utheses2datacite ($row, $info, $utheses_id) + +Transcribe utheses $info for a given utheses_id into xml and return that. +$row is filled with information extracted from $info. + +=cut + sub utheses2datacite { + my $row= shift; my $info= shift; my $utheses_id= shift; @@ -204,6 +633,9 @@ sub utheses2datacite if ($doi =~ m#^10.25365/(thesis\.\d+)$#) { $suffix= $1; + # rewrite DOI, if necessary, e.g. for the test environment + $doi= join ('/', $ut_cnf->{use_prefix}, $suffix) if (exists($ut_cnf->{use_prefix})); + # $doi= join ('/', '10.493943', $suffix); # TEST: fake prefix } else { @@ -215,8 +647,7 @@ sub utheses2datacite push (@datacite_conversion_errors, "bad_url=[$persistent_link]") unless ($persistent_link =~ m#^https://utheses.univie.ac.at/detail/\d+/?$#); my ($titles, $abstracts, $publication_date, $langs, $policies)= map { $th->{$_} } qw(titles abstracts publication_date languages policies); - my $fulltext_locked= $policies->{fulltext_locked}; - + my $publication_year; if ($publication_date =~ m#^(\d{4})-?#) { @@ -249,6 +680,8 @@ sub utheses2datacite <creators> EOX + # $xml .= " <urxn>bla</urxn>\n"; # TEST: check what happens with invalid XML + foreach my $author (@$authors) { $xml .= << "EOX"; @@ -317,23 +750,24 @@ EOX </resource> EOX - my %row= - ( - utheses_id => $utheses_id, - suffix => $suffix, - doi => $doi, - nbn => $nbn, - ac_number => $ac_number, - langs => join(',', @$langs), - language => $main_language, - persistent_link => $persistent_link, - datacite_conversion_errors => \@datacite_conversion_errors, - fulltext_locked => $fulltext_locked, - phaidra => $phaidra, - policies => $policies, - ); - - (\%row, $xml); + $row->{utheses_fetched}= time(); + $row->{utheses_id}= $utheses_id; + $row->{suffix}= $suffix; + $row->{doi}= $doi; + $row->{nbn}= $nbn; + $row->{ac_number}= $ac_number; + $row->{langs}= join(',', @$langs); + $row->{language}= $main_language; + $row->{persistent_link }=$persistent_link; + $row->{datacite_conversion_errors}= \@datacite_conversion_errors; + $row->{datacite_conversion_error_count}= scalar @datacite_conversion_errors; + $row->{phaidra}= $phaidra; # ... mapped, see below + $row->{policies}= $policies; # ... mapped, see below + + # foreach my $an (qw(fulltext_locked lock_status lock_until_date)) { $row->{$an}= $policies->{$an} } + # foreach my $an (qw(container_pid container_status thesis_doc_pid thesis_doc_status)) { $row->{$an}= $phaidra->{$an} } + + $xml; } sub xml_escape @@ -347,3 +781,47 @@ sub xml_escape $s; } +sub report_problem +{ + my $problem= shift; + + my $ut_col= $agent_db->get_collection('problems'); + + $problem->{ts_iso_gmt}= Util::ts::ts_ISO_gmt(); + + my $area= $problem->{area}; + my $check_id; + if ($area eq 'utheses') { $check_id= 'utheses_id' } + elsif ($area eq 'marc') { $check_id= 'ac_number' } + elsif ($area eq 'datacite') { $check_id= 'doi' } + + $ut_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists + + print __LINE__, " ATTN: problem reported for $check_id=$id: ", Dumper($problem); +} + +sub remove_problem +{ + my $area= shift; + my $id= shift; + my $check_id= shift; + + my $ut_col= $agent_db->get_collection('problems'); + + unless (defined ($check_id)) + { + if ($area eq 'utheses') { $check_id= 'utheses_id' } + elsif ($area eq 'marc') { $check_id= 'ac_number' } + elsif ($area eq 'datacite') { $check_id= 'doi' } + } + + $ut_col->remove({ area => $area, $check_id => $id }); +} + +__END__ + +=head1 TODO + +* handle signals like SIGINT + + -- GitLab