Skip to content
Snippets Groups Projects
Commit a5c87c6c authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

added op_mode to analyze utheses object and optionally register DOI with DataCite

parent b5f2bd13
No related branches found
No related tags found
No related merge requests found
......@@ -14,19 +14,38 @@ $Data::Dumper::Indent= 1;
use lib 'lib';
use Univie::Utheses::API;
use Phaidra::Utils::iso639;
use Util::JSON;
use DataCite::API;
# my %LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639
my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors );
my @TSV_COLUMNS= qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors );
my $op_mode= 'fetch_metadata';
# my $op_mode= 'fetch_metadata_bulk';
my $op_mode= 'analyze';
my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp!
my $config_file= '/etc/irma/DataCite.json';
my $do_register_doi= 0;
my @pars= ();
while (my $arg= shift (@ARGV))
{
if ($arg =~ /^-/)
if ($arg eq '--') { push (@pars, @ARGV); @ARGV= (); }
elsif ($arg =~ /^--(.+)/)
{
my ($opt, $val)= split ('=', $1, 2);
if ($opt eq 'help') { usage(); }
elsif ($opt eq 'register-doi') { $do_register_doi= 1; }
else { usage(); }
}
elsif ($arg =~ /^-(.+)/)
{
foreach my $opt (split ('', $1))
{
if ($opt eq 'h') { usage(); exit (0); }
# elsif ($opt eq 'x') { $x_flag= 1; }
else { usage("unknown option $arg"); }
}
}
else
{
......@@ -34,55 +53,150 @@ while (my $arg= shift (@ARGV))
}
}
if ($op_mode eq 'fetch_metadata')
my $cnf= Util::JSON::read_json_file ($config_file);
# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0);
my $ut_cnf= $cnf->{repositories}->{'utheses.univie.ac.at'};
# print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0);
my $reg_cnf= $cnf->{doi_registries}->{$ut_cnf->{registry}};
# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0);
my $utapi= new Univie::Utheses::API( config => { api_url => $ut_cnf->{api_url} } );
die "no utapi" unless (defined ($utapi));
my $reg_obj= new DataCite::API (config => $reg_cnf, xmode => 'test');
die "no reg_obj" unless (defined ($reg_obj));
if ($op_mode eq 'analyze')
{
foreach my $par (@pars)
{
if ($par =~ m#^\d+$#) { analyze_utheses_item($par); }
}
}
elsif ($op_mode eq 'fetch_metadata_bulk')
{
fetch_metadata(\@pars);
fetch_metadata_bulk(\@pars);
}
exit(0);
sub fetch_metadata
sub analyze_utheses_item
{
my $utheses_ids= shift;
my $utheses_id= shift;
my ($row, $xml)= get_utheses_metadata($utheses_id);
my $utapi= new Univie::Utheses::API( config => { api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at' } );
print __LINE__, " utheses_id=[$utheses_id] ", '='x50, "\n";
print __LINE__, " xml=[$xml]\n";
print __LINE__, " row: ", Dumper($row);
if ($do_register_doi)
{
unless (defined ($row))
{
print "ATTN: can't register DOI: no utheses data found\n";
goto END;
}
unless (defined ($xml))
{
print "ATTN: can't register DOI: no xml data generated\n";
goto END;
}
my ($doi, $url, $errors, $ftl)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors fulltext_locked);
if ($ftl)
{
print "ATTN: can't register DOI: fulltext locked\n";
goto END;
}
if (@$errors)
{
print "ATTN: can't register DOI $doi due to errors: ", join(', ', @$errors), "\n";
goto END;
}
if ($reg_obj->register_doi ($doi, $xml, $url))
{
print "NOTE: register_doi doi=[$doi] url=[$url] OK\n";
}
else
{
print "ATTN: register_doi doi=[$doi] url=[$url] was not ok\n";
}
}
END:
return;
}
sub fetch_metadata_bulk
{
my $utheses_ids= shift;
open (TSV, '>:utf8', $fnm_tsv) or die;
print TSV join("\t", @TSV_COLUMNS), "\n";
open (ERRORS, '>>:utf8', '@errors.lst') or die;
foreach my $utheses_id (@$utheses_ids)
{
next if ($utheses_id eq 'utheses_id'); # CSV column name...
print __LINE__, " utheses_id=[$utheses_id]\n";
my $info= $utapi->getContainerPublicMetadata($utheses_id);
print __LINE__, " info: ", Dumper($info);
my ($row, $xml)= get_utheses_metadata($utheses_id);
if ($info->{status} eq '200' && exists ($info->{metadata}->{thesis}->{doi}))
if (defined ($row))
{
my ($row, $xml)= utheses2datacite($info->{metadata}, $utheses_id);
print __LINE__, " row: ", Dumper($row);
print __LINE__, " xml=[$xml]\n";
print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n";
if (defined ($row->{doi}))
}
}
}
sub get_utheses_metadata
{
my $utheses_id= shift;
print __LINE__, " utheses_id=[$utheses_id]\n";
my $info= $utapi->getContainerPublicMetadata($utheses_id);
print __LINE__, " info: ", Dumper($info);
my ($row, $xml);
if ($info->{status} eq '200')
{
($row, $xml)= utheses2datacite($info, $utheses_id);
# print __LINE__, " row: ", Dumper($row);
if (defined ($row->{doi}))
{
$row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml';
# print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
if (defined ($xml) && defined ($row->{xml_fnm}) && !@{$row->{datacite_conversion_errors}})
{
my $xml_fnm= 'utheses/DataCite_XML/'.$row->{doi}.'.xml';
open (XML, '>:utf8', $xml_fnm) or die;
open (XML, '>:utf8', $row->{xml_fnm}) or die; # TODO: or do something else...
print XML $xml;
close (XML);
}
}
}
else
{
printf ERRORS ("utheses_id=%d status=%s\n", $utheses_id, $info->{status});
}
return ($row, $xml);
}
sub utheses2datacite
{
my $md= shift;
my $info= shift;
my $utheses_id= shift;
my @errors;
my @datacite_conversion_errors;
my ($th, $authors)= map { $md->{$_} } qw(thesis authors);
my ($md)= map { $info->{$_} } qw(metadata);
my ($th, $authors, $phaidra)= map { $md->{$_} } qw(thesis authors phaidra);
my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link);
# TODO: check doi, nbn, ac_number for syntactic validity
......@@ -93,17 +207,39 @@ sub utheses2datacite
}
else
{
push (@errors, 'bad_doi');
push (@datacite_conversion_errors, "bad_doi=[$doi]");
}
push (@errors, 'bad_nbn') unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#);
push (@errors, 'bad_ac_number') unless ($ac_number =~ m#^AC\d{8}$#);
push (@datacite_conversion_errors, "bad_nbn=[$nbn]") unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#);
push (@datacite_conversion_errors, "bad_ac_number=[$ac_number]") unless ($ac_number =~ m#^AC\d{8}$#);
my ($titles, $abstracts, $publication_date, $langs)= map { $th->{$_} } qw(titels abstracts publication_date languages);
my $language= Phaidra::Utils::iso639::iso_639_2_to_1($langs->[0]);
unless (defined ($language))
push (@datacite_conversion_errors, "bad_url=[$persistent_link]") unless ($persistent_link =~ m#^https://utheses.univie.ac.at/detail/\d+/?$#);
my ($titles, $abstracts, $publication_date, $langs, $policies)= map { $th->{$_} } qw(titles abstracts publication_date languages policies);
my $fulltext_locked= $policies->{fulltext_locked};
my $publication_year;
if ($publication_date =~ m#^(\d{4})-?#)
{
$language= 'UNKNOWN';
push (@errors, 'language');
$publication_year= $1;
}
else
{
push (@datacite_conversion_errors, "bad_pub_date=[$publication_date]");
}
my $main_language;
foreach my $lang (@$langs)
{
my $language= Phaidra::Utils::iso639::iso_639_2_to_1($lang);
if (defined ($language))
{
$main_language= $language unless (defined ($main_language));
}
else
{
push (@datacite_conversion_errors, "bad_language=[$lang]");
$language= ':tba';
}
}
my $xml= << "EOX";
......@@ -127,23 +263,51 @@ EOX
<titles>
EOX
foreach my $title (@$titles)
{
my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $tl= $title->{title_lang});
unless (defined ($language))
{
push (@datacite_conversion_errors, "bad_title_language=[$tl]");
$language= ':tba';
}
my $tt= 'titleType="TranslatedTitle" ' if ($title->{type} eq 'parallel');
my $text= xml_escape($title->{title_text});
$xml .= <<"EOX";
<title ${tt}xml:lang="$language">
$text
</title>
EOX
}
$xml .= << "EOX";
</titles>
<publisher>(:none)</publisher>
<publicationYear>2002</publicationYear>
<language>$language</language>
<publisher>:none</publisher>
<publicationYear>$publication_year</publicationYear>
<language>$main_language</language>
<resourceType resourceTypeGeneral="Text">Thesis</resourceType>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="nbn">$nbn</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="URL">$persistent_link</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="URN">$nbn</alternateIdentifier>
</alternateIdentifiers>
<descriptions>
EOX
foreach my $abstract (@$abstracts)
{
my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $al= $abstract->{language});
unless (defined ($language))
{
push (@datacite_conversion_errors, "bad_abstract_language=[$al]");
$language= ':tba';
}
# TODO: text should be escaped, if necessary
my $text= xml_escape($abstract->{text});
$xml .= << "EOX";
<description descriptionType="Abstract">
$abstract->{text}
<description descriptionType="Abstract" xml:lang="$language">
$text
</description>
EOX
}
......@@ -161,11 +325,25 @@ EOX
nbn => $nbn,
ac_number => $ac_number,
langs => join(',', @$langs),
language => $language,
language => $main_language,
persistent_link => $persistent_link,
errors => join(',',@errors),
datacite_conversion_errors => \@datacite_conversion_errors,
fulltext_locked => $fulltext_locked,
phaidra => $phaidra,
policies => $policies,
);
(\%row, $xml);
}
sub xml_escape
{
my $s= shift;
$s=~ s/&/&amp;/g;
$s=~ s/</&lt;/g;
$s=~ s/>/&gt;/g;
$s;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment