Skip to content
Snippets Groups Projects
Commit a5c87c6c authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

added op_mode to analyze utheses object and optionally register DOI with DataCite

parent b5f2bd13
No related branches found
No related tags found
No related merge requests found
...@@ -14,19 +14,38 @@ $Data::Dumper::Indent= 1; ...@@ -14,19 +14,38 @@ $Data::Dumper::Indent= 1;
use lib 'lib'; use lib 'lib';
use Univie::Utheses::API; use Univie::Utheses::API;
use Phaidra::Utils::iso639; use Phaidra::Utils::iso639;
use Util::JSON;
use DataCite::API;
# my %LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639 my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors );
my @TSV_COLUMNS= qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors ); # my $op_mode= 'fetch_metadata_bulk';
my $op_mode= 'analyze';
my $op_mode= 'fetch_metadata';
my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp! my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp!
my $config_file= '/etc/irma/DataCite.json';
my $do_register_doi= 0;
my @pars= (); my @pars= ();
while (my $arg= shift (@ARGV)) while (my $arg= shift (@ARGV))
{ {
if ($arg =~ /^-/) if ($arg eq '--') { push (@pars, @ARGV); @ARGV= (); }
elsif ($arg =~ /^--(.+)/)
{ {
my ($opt, $val)= split ('=', $1, 2);
if ($opt eq 'help') { usage(); }
elsif ($opt eq 'register-doi') { $do_register_doi= 1; }
else { usage(); }
}
elsif ($arg =~ /^-(.+)/)
{
foreach my $opt (split ('', $1))
{
if ($opt eq 'h') { usage(); exit (0); }
# elsif ($opt eq 'x') { $x_flag= 1; }
else { usage("unknown option $arg"); }
}
} }
else else
{ {
...@@ -34,55 +53,150 @@ while (my $arg= shift (@ARGV)) ...@@ -34,55 +53,150 @@ while (my $arg= shift (@ARGV))
} }
} }
if ($op_mode eq 'fetch_metadata') my $cnf= Util::JSON::read_json_file ($config_file);
# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0);
my $ut_cnf= $cnf->{repositories}->{'utheses.univie.ac.at'};
# print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0);
my $reg_cnf= $cnf->{doi_registries}->{$ut_cnf->{registry}};
# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0);
my $utapi= new Univie::Utheses::API( config => { api_url => $ut_cnf->{api_url} } );
die "no utapi" unless (defined ($utapi));
my $reg_obj= new DataCite::API (config => $reg_cnf, xmode => 'test');
die "no reg_obj" unless (defined ($reg_obj));
if ($op_mode eq 'analyze')
{
foreach my $par (@pars)
{
if ($par =~ m#^\d+$#) { analyze_utheses_item($par); }
}
}
elsif ($op_mode eq 'fetch_metadata_bulk')
{ {
fetch_metadata(\@pars); fetch_metadata_bulk(\@pars);
} }
exit(0); exit(0);
sub fetch_metadata sub analyze_utheses_item
{ {
my $utheses_ids= shift; my $utheses_id= shift;
my ($row, $xml)= get_utheses_metadata($utheses_id);
my $utapi= new Univie::Utheses::API( config => { api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at' } ); print __LINE__, " utheses_id=[$utheses_id] ", '='x50, "\n";
print __LINE__, " xml=[$xml]\n";
print __LINE__, " row: ", Dumper($row);
if ($do_register_doi)
{
unless (defined ($row))
{
print "ATTN: can't register DOI: no utheses data found\n";
goto END;
}
unless (defined ($xml))
{
print "ATTN: can't register DOI: no xml data generated\n";
goto END;
}
my ($doi, $url, $errors, $ftl)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors fulltext_locked);
if ($ftl)
{
print "ATTN: can't register DOI: fulltext locked\n";
goto END;
}
if (@$errors)
{
print "ATTN: can't register DOI $doi due to errors: ", join(', ', @$errors), "\n";
goto END;
}
if ($reg_obj->register_doi ($doi, $xml, $url))
{
print "NOTE: register_doi doi=[$doi] url=[$url] OK\n";
}
else
{
print "ATTN: register_doi doi=[$doi] url=[$url] was not ok\n";
}
}
END:
return;
}
sub fetch_metadata_bulk
{
my $utheses_ids= shift;
open (TSV, '>:utf8', $fnm_tsv) or die; open (TSV, '>:utf8', $fnm_tsv) or die;
print TSV join("\t", @TSV_COLUMNS), "\n"; print TSV join("\t", @TSV_COLUMNS), "\n";
open (ERRORS, '>>:utf8', '@errors.lst') or die;
foreach my $utheses_id (@$utheses_ids) foreach my $utheses_id (@$utheses_ids)
{ {
next if ($utheses_id eq 'utheses_id'); # CSV column name... next if ($utheses_id eq 'utheses_id'); # CSV column name...
print __LINE__, " utheses_id=[$utheses_id]\n"; my ($row, $xml)= get_utheses_metadata($utheses_id);
my $info= $utapi->getContainerPublicMetadata($utheses_id);
print __LINE__, " info: ", Dumper($info);
if ($info->{status} eq '200' && exists ($info->{metadata}->{thesis}->{doi})) if (defined ($row))
{ {
my ($row, $xml)= utheses2datacite($info->{metadata}, $utheses_id);
print __LINE__, " row: ", Dumper($row);
print __LINE__, " xml=[$xml]\n";
print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n"; print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n";
if (defined ($row->{doi}))
}
}
}
sub get_utheses_metadata
{
my $utheses_id= shift;
print __LINE__, " utheses_id=[$utheses_id]\n";
my $info= $utapi->getContainerPublicMetadata($utheses_id);
print __LINE__, " info: ", Dumper($info);
my ($row, $xml);
if ($info->{status} eq '200')
{
($row, $xml)= utheses2datacite($info, $utheses_id);
# print __LINE__, " row: ", Dumper($row);
if (defined ($row->{doi}))
{
$row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml';
# print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
if (defined ($xml) && defined ($row->{xml_fnm}) && !@{$row->{datacite_conversion_errors}})
{ {
my $xml_fnm= 'utheses/DataCite_XML/'.$row->{doi}.'.xml'; open (XML, '>:utf8', $row->{xml_fnm}) or die; # TODO: or do something else...
open (XML, '>:utf8', $xml_fnm) or die;
print XML $xml; print XML $xml;
close (XML); close (XML);
} }
} }
} }
else
{
printf ERRORS ("utheses_id=%d status=%s\n", $utheses_id, $info->{status});
}
return ($row, $xml);
} }
sub utheses2datacite sub utheses2datacite
{ {
my $md= shift; my $info= shift;
my $utheses_id= shift; my $utheses_id= shift;
my @errors; my @datacite_conversion_errors;
my ($th, $authors)= map { $md->{$_} } qw(thesis authors); my ($md)= map { $info->{$_} } qw(metadata);
my ($th, $authors, $phaidra)= map { $md->{$_} } qw(thesis authors phaidra);
my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link); my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link);
# TODO: check doi, nbn, ac_number for syntactic validity # TODO: check doi, nbn, ac_number for syntactic validity
...@@ -93,17 +207,39 @@ sub utheses2datacite ...@@ -93,17 +207,39 @@ sub utheses2datacite
} }
else else
{ {
push (@errors, 'bad_doi'); push (@datacite_conversion_errors, "bad_doi=[$doi]");
} }
push (@errors, 'bad_nbn') unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#); push (@datacite_conversion_errors, "bad_nbn=[$nbn]") unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#);
push (@errors, 'bad_ac_number') unless ($ac_number =~ m#^AC\d{8}$#); push (@datacite_conversion_errors, "bad_ac_number=[$ac_number]") unless ($ac_number =~ m#^AC\d{8}$#);
my ($titles, $abstracts, $publication_date, $langs)= map { $th->{$_} } qw(titels abstracts publication_date languages); push (@datacite_conversion_errors, "bad_url=[$persistent_link]") unless ($persistent_link =~ m#^https://utheses.univie.ac.at/detail/\d+/?$#);
my $language= Phaidra::Utils::iso639::iso_639_2_to_1($langs->[0]);
unless (defined ($language)) my ($titles, $abstracts, $publication_date, $langs, $policies)= map { $th->{$_} } qw(titles abstracts publication_date languages policies);
my $fulltext_locked= $policies->{fulltext_locked};
my $publication_year;
if ($publication_date =~ m#^(\d{4})-?#)
{ {
$language= 'UNKNOWN'; $publication_year= $1;
push (@errors, 'language'); }
else
{
push (@datacite_conversion_errors, "bad_pub_date=[$publication_date]");
}
my $main_language;
foreach my $lang (@$langs)
{
my $language= Phaidra::Utils::iso639::iso_639_2_to_1($lang);
if (defined ($language))
{
$main_language= $language unless (defined ($main_language));
}
else
{
push (@datacite_conversion_errors, "bad_language=[$lang]");
$language= ':tba';
}
} }
my $xml= << "EOX"; my $xml= << "EOX";
...@@ -127,23 +263,51 @@ EOX ...@@ -127,23 +263,51 @@ EOX
<titles> <titles>
EOX EOX
foreach my $title (@$titles)
{
my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $tl= $title->{title_lang});
unless (defined ($language))
{
push (@datacite_conversion_errors, "bad_title_language=[$tl]");
$language= ':tba';
}
my $tt= 'titleType="TranslatedTitle" ' if ($title->{type} eq 'parallel');
my $text= xml_escape($title->{title_text});
$xml .= <<"EOX";
<title ${tt}xml:lang="$language">
$text
</title>
EOX
}
$xml .= << "EOX"; $xml .= << "EOX";
</titles> </titles>
<publisher>(:none)</publisher> <publisher>:none</publisher>
<publicationYear>2002</publicationYear> <publicationYear>$publication_year</publicationYear>
<language>$language</language> <language>$main_language</language>
<resourceType resourceTypeGeneral="Text">Thesis</resourceType> <resourceType resourceTypeGeneral="Text">Thesis</resourceType>
<alternateIdentifiers> <alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="nbn">$nbn</alternateIdentifier> <alternateIdentifier alternateIdentifierType="URL">$persistent_link</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="URN">$nbn</alternateIdentifier>
</alternateIdentifiers> </alternateIdentifiers>
<descriptions> <descriptions>
EOX EOX
foreach my $abstract (@$abstracts) foreach my $abstract (@$abstracts)
{ {
my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $al= $abstract->{language});
unless (defined ($language))
{
push (@datacite_conversion_errors, "bad_abstract_language=[$al]");
$language= ':tba';
}
# TODO: text should be escaped, if necessary
my $text= xml_escape($abstract->{text});
$xml .= << "EOX"; $xml .= << "EOX";
<description descriptionType="Abstract"> <description descriptionType="Abstract" xml:lang="$language">
$abstract->{text} $text
</description> </description>
EOX EOX
} }
...@@ -161,11 +325,25 @@ EOX ...@@ -161,11 +325,25 @@ EOX
nbn => $nbn, nbn => $nbn,
ac_number => $ac_number, ac_number => $ac_number,
langs => join(',', @$langs), langs => join(',', @$langs),
language => $language, language => $main_language,
persistent_link => $persistent_link, persistent_link => $persistent_link,
errors => join(',',@errors), datacite_conversion_errors => \@datacite_conversion_errors,
fulltext_locked => $fulltext_locked,
phaidra => $phaidra,
policies => $policies,
); );
(\%row, $xml); (\%row, $xml);
} }
sub xml_escape
{
my $s= shift;
$s=~ s/&/&amp;/g;
$s=~ s/</&lt;/g;
$s=~ s/>/&gt;/g;
$s;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment