Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
irma2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Gerhard Gonter
irma2
Commits
a5c87c6c
Commit
a5c87c6c
authored
2 years ago
by
Gerhard Gonter
Browse files
Options
Downloads
Patches
Plain Diff
added op_mode to analyze utheses object and optionally register DOI with DataCite
parent
b5f2bd13
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
ut1.pl
+218
-40
218 additions, 40 deletions
ut1.pl
with
218 additions
and
40 deletions
ut1.pl
+
218
−
40
View file @
a5c87c6c
...
...
@@ -14,19 +14,38 @@ $Data::Dumper::Indent= 1;
use
lib
'
lib
';
use
Univie::Utheses::
API
;
use
Phaidra::Utils::
iso639
;
use
Util::
JSON
;
use
DataCite::
API
;
#
my
%LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639
my
@TSV_COLUMNS
=
qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors )
;
my
@TSV_COLUMNS
=
qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors )
;
my
$op_mode
=
'
fetch_metadata
';
# my $op_mode= 'fetch_metadata_bulk';
my
$op_mode
=
'
analyze
';
my
$fnm_tsv
=
'
utheses/utheses_info.tsv
';
# TODO: timestamp!
my
$config_file
=
'
/etc/irma/DataCite.json
';
my
$do_register_doi
=
0
;
my
@pars
=
();
while
(
my
$arg
=
shift
(
@ARGV
))
{
if
(
$arg
=~
/^-/
)
if
(
$arg
eq
'
--
')
{
push
(
@pars
,
@ARGV
);
@ARGV
=
();
}
elsif
(
$arg
=~
/^--(.+)/
)
{
my
(
$opt
,
$val
)
=
split
('
=
',
$
1
,
2
);
if
(
$opt
eq
'
help
')
{
usage
();
}
elsif
(
$opt
eq
'
register-doi
')
{
$do_register_doi
=
1
;
}
else
{
usage
();
}
}
elsif
(
$arg
=~
/^-(.+)/
)
{
foreach
my
$opt
(
split
('',
$
1
))
{
if
(
$opt
eq
'
h
')
{
usage
();
exit
(
0
);
}
# elsif ($opt eq 'x') { $x_flag= 1; }
else
{
usage
("
unknown option
$arg
");
}
}
}
else
{
...
...
@@ -34,55 +53,150 @@ while (my $arg= shift (@ARGV))
}
}
if
(
$op_mode
eq
'
fetch_metadata
')
my
$cnf
=
Util::JSON::
read_json_file
(
$config_file
);
# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0);
my
$ut_cnf
=
$cnf
->
{
repositories
}
->
{'
utheses.univie.ac.at
'};
# print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0);
my
$reg_cnf
=
$cnf
->
{
doi_registries
}
->
{
$ut_cnf
->
{
registry
}};
# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0);
my
$utapi
=
new
Univie::Utheses::
API
(
config
=>
{
api_url
=>
$ut_cnf
->
{
api_url
}
}
);
die
"
no utapi
"
unless
(
defined
(
$utapi
));
my
$reg_obj
=
new
DataCite::
API
(
config
=>
$reg_cnf
,
xmode
=>
'
test
');
die
"
no reg_obj
"
unless
(
defined
(
$reg_obj
));
if
(
$op_mode
eq
'
analyze
')
{
foreach
my
$par
(
@pars
)
{
if
(
$par
=~
m#^\d+$#
)
{
analyze_utheses_item
(
$par
);
}
}
}
elsif
(
$op_mode
eq
'
fetch_metadata_bulk
')
{
fetch_metadata
(
\
@pars
);
fetch_metadata
_bulk
(
\
@pars
);
}
exit
(
0
);
sub
fetch_metadata
sub
analyze_utheses_item
{
my
$utheses_ids
=
shift
;
my
$utheses_id
=
shift
;
my
(
$row
,
$xml
)
=
get_utheses_metadata
(
$utheses_id
);
my
$utapi
=
new
Univie::Utheses::
API
(
config
=>
{
api_url
=>
'
https://utheses-api-utheses-prod.cprod.univie.ac.at
'
}
);
print
__LINE__
,
"
utheses_id=[
$utheses_id
]
",
'
=
'
x50
,
"
\n
";
print
__LINE__
,
"
xml=[
$xml
]
\n
";
print
__LINE__
,
"
row:
",
Dumper
(
$row
);
if
(
$do_register_doi
)
{
unless
(
defined
(
$row
))
{
print
"
ATTN: can't register DOI: no utheses data found
\n
";
goto
END
;
}
unless
(
defined
(
$xml
))
{
print
"
ATTN: can't register DOI: no xml data generated
\n
";
goto
END
;
}
my
(
$doi
,
$url
,
$errors
,
$ftl
)
=
map
{
$row
->
{
$_
}
}
qw(doi persistent_link datacite_conversion_errors fulltext_locked)
;
if
(
$ftl
)
{
print
"
ATTN: can't register DOI: fulltext locked
\n
";
goto
END
;
}
if
(
@$errors
)
{
print
"
ATTN: can't register DOI
$doi
due to errors:
",
join
('
,
',
@$errors
),
"
\n
";
goto
END
;
}
if
(
$reg_obj
->
register_doi
(
$doi
,
$xml
,
$url
))
{
print
"
NOTE: register_doi doi=[
$doi
] url=[
$url
] OK
\n
";
}
else
{
print
"
ATTN: register_doi doi=[
$doi
] url=[
$url
] was not ok
\n
";
}
}
END
:
return
;
}
sub
fetch_metadata_bulk
{
my
$utheses_ids
=
shift
;
open
(
TSV
,
'
>:utf8
',
$fnm_tsv
)
or
die
;
print
TSV
join
("
\t
",
@TSV_COLUMNS
),
"
\n
";
open
(
ERRORS
,
'
>>:utf8
',
'
@errors.lst
')
or
die
;
foreach
my
$utheses_id
(
@$utheses_ids
)
{
next
if
(
$utheses_id
eq
'
utheses_id
');
# CSV column name...
print
__LINE__
,
"
utheses_id=[
$utheses_id
]
\n
";
my
$info
=
$utapi
->
getContainerPublicMetadata
(
$utheses_id
);
print
__LINE__
,
"
info:
",
Dumper
(
$info
);
my
(
$row
,
$xml
)
=
get_utheses_metadata
(
$utheses_id
);
if
(
$info
->
{
status
}
eq
'
200
'
&&
exists
(
$info
->
{
metadata
}
->
{
thesis
}
->
{
doi
}
))
if
(
defined
(
$row
))
{
my
(
$row
,
$xml
)
=
utheses2datacite
(
$info
->
{
metadata
},
$utheses_id
);
print
__LINE__
,
"
row:
",
Dumper
(
$row
);
print
__LINE__
,
"
xml=[
$xml
]
\n
";
print
TSV
join
("
\t
",
map
{
$row
->
{
$_
}
}
@TSV_COLUMNS
),
"
\n
";
if
(
defined
(
$row
->
{
doi
}))
}
}
}
sub
get_utheses_metadata
{
my
$utheses_id
=
shift
;
print
__LINE__
,
"
utheses_id=[
$utheses_id
]
\n
";
my
$info
=
$utapi
->
getContainerPublicMetadata
(
$utheses_id
);
print
__LINE__
,
"
info:
",
Dumper
(
$info
);
my
(
$row
,
$xml
);
if
(
$info
->
{
status
}
eq
'
200
')
{
(
$row
,
$xml
)
=
utheses2datacite
(
$info
,
$utheses_id
);
# print __LINE__, " row: ", Dumper($row);
if
(
defined
(
$row
->
{
doi
}))
{
$row
->
{
xml_fnm
}
=
my
$xml_fnm
=
'
utheses/DataCite_XML/
'
.
$row
->
{
doi
}
.
'
.xml
';
# print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
if
(
defined
(
$xml
)
&&
defined
(
$row
->
{
xml_fnm
})
&&
!
@
{
$row
->
{
datacite_conversion_errors
}})
{
my
$xml_fnm
=
'
utheses/DataCite_XML/
'
.
$row
->
{
doi
}
.
'
.xml
';
open
(
XML
,
'
>:utf8
',
$xml_fnm
)
or
die
;
open
(
XML
,
'
>:utf8
',
$row
->
{
xml_fnm
})
or
die
;
# TODO: or do something else...
print
XML
$xml
;
close
(
XML
);
}
}
}
else
{
printf
ERRORS
("
utheses_id=%d status=%s
\n
",
$utheses_id
,
$info
->
{
status
});
}
return
(
$row
,
$xml
);
}
sub
utheses2datacite
{
my
$
md
=
shift
;
my
$
info
=
shift
;
my
$utheses_id
=
shift
;
my
@errors
;
my
@
datacite_conversion_
errors
;
my
(
$th
,
$authors
)
=
map
{
$md
->
{
$_
}
}
qw(thesis authors)
;
my
(
$md
)
=
map
{
$info
->
{
$_
}
}
qw(metadata)
;
my
(
$th
,
$authors
,
$phaidra
)
=
map
{
$md
->
{
$_
}
}
qw(thesis authors phaidra)
;
my
(
$doi
,
$nbn
,
$ac_number
,
$persistent_link
)
=
map
{
$th
->
{
$_
}
}
qw(doi urn ac_number persistent_link)
;
# TODO: check doi, nbn, ac_number for syntactic validity
...
...
@@ -93,17 +207,39 @@ sub utheses2datacite
}
else
{
push
(
@errors
,
'
bad_doi
'
);
push
(
@
datacite_conversion_
errors
,
"
bad_doi
=[
$doi
]
"
);
}
push
(
@errors
,
'
bad_nbn
'
)
unless
(
$nbn
=~
m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#
);
push
(
@errors
,
'
bad_ac_number
'
)
unless
(
$ac_number
=~
m#^AC\d{8}$#
);
push
(
@
datacite_conversion_
errors
,
"
bad_nbn
=[
$nbn
]
"
)
unless
(
$nbn
=~
m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#
);
push
(
@
datacite_conversion_
errors
,
"
bad_ac_number
=[
$ac_number
]
"
)
unless
(
$ac_number
=~
m#^AC\d{8}$#
);
my
(
$titles
,
$abstracts
,
$publication_date
,
$langs
)
=
map
{
$th
->
{
$_
}
}
qw(titels abstracts publication_date languages)
;
my
$language
=
Phaidra::Utils::iso639::
iso_639_2_to_1
(
$langs
->
[
0
]);
unless
(
defined
(
$language
))
push
(
@datacite_conversion_errors
,
"
bad_url=[
$persistent_link
]
")
unless
(
$persistent_link
=~
m#^https://utheses.univie.ac.at/detail/\d+/?$#
);
my
(
$titles
,
$abstracts
,
$publication_date
,
$langs
,
$policies
)
=
map
{
$th
->
{
$_
}
}
qw(titles abstracts publication_date languages policies)
;
my
$fulltext_locked
=
$policies
->
{
fulltext_locked
};
my
$publication_year
;
if
(
$publication_date
=~
m#^(\d{4})-?#
)
{
$language
=
'
UNKNOWN
';
push
(
@errors
,
'
language
');
$publication_year
=
$
1
;
}
else
{
push
(
@datacite_conversion_errors
,
"
bad_pub_date=[
$publication_date
]
");
}
my
$main_language
;
foreach
my
$lang
(
@$langs
)
{
my
$language
=
Phaidra::Utils::iso639::
iso_639_2_to_1
(
$lang
);
if
(
defined
(
$language
))
{
$main_language
=
$language
unless
(
defined
(
$main_language
));
}
else
{
push
(
@datacite_conversion_errors
,
"
bad_language=[
$lang
]
");
$language
=
'
:tba
';
}
}
my
$xml
=
<<
"
EOX
";
...
...
@@ -127,23 +263,51 @@ EOX
<
titles
>
EOX
foreach
my
$title
(
@$titles
)
{
my
$language
=
Phaidra::Utils::iso639::
iso_639_2_to_1
(
my
$tl
=
$title
->
{
title_lang
});
unless
(
defined
(
$language
))
{
push
(
@datacite_conversion_errors
,
"
bad_title_language=[
$tl
]
");
$language
=
'
:tba
';
}
my
$tt
=
'
titleType="TranslatedTitle"
'
if
(
$title
->
{
type
}
eq
'
parallel
');
my
$text
=
xml_escape
(
$title
->
{
title_text
});
$xml
.=
<<"EOX";
<title ${tt}xml:lang="$language">
$text
</title>
EOX
}
$xml
.=
<<
"
EOX
";
</
titles
>
<
publisher
>
(
:
none
)
</
publisher
>
<
publicationYear
>
2002
</
publicationYear
>
<
language
>
$language
</
language
>
<
publisher
>
:
none
</
publisher
>
<
publicationYear
>
$publication_year
</
publicationYear
>
<
language
>
$
main_
language
</
language
>
<
resourceType
resourceTypeGeneral
=
"
Text
"
>
Thesis
</
resourceType
>
<
alternateIdentifiers
>
<
alternateIdentifier
alternateIdentifierType
=
"
nbn
"
>
$nbn
</
alternateIdentifier
>
<
alternateIdentifier
alternateIdentifierType
=
"
URL
"
>
$persistent_link
</
alternateIdentifier
>
<
alternateIdentifier
alternateIdentifierType
=
"
URN
"
>
$nbn
</
alternateIdentifier
>
</
alternateIdentifiers
>
<
descriptions
>
EOX
foreach
my
$abstract
(
@$abstracts
)
{
my
$language
=
Phaidra::Utils::iso639::
iso_639_2_to_1
(
my
$al
=
$abstract
->
{
language
});
unless
(
defined
(
$language
))
{
push
(
@datacite_conversion_errors
,
"
bad_abstract_language=[
$al
]
");
$language
=
'
:tba
';
}
# TODO: text should be escaped, if necessary
my
$text
=
xml_escape
(
$abstract
->
{
text
});
$xml
.=
<<
"
EOX
";
<
description
descriptionType
=
"
Abstract
"
>
$
abstract
->
{
text
}
<
description
descriptionType
=
"
Abstract
"
xml:lang
=
"
$language
"
>
$text
</
description
>
EOX
}
...
...
@@ -161,11 +325,25 @@ EOX
nbn
=>
$nbn
,
ac_number
=>
$ac_number
,
langs
=>
join
('
,
',
@$langs
),
language
=>
$language
,
language
=>
$
main_
language
,
persistent_link
=>
$persistent_link
,
errors
=>
join
('
,
',
@errors
),
datacite_conversion_errors
=>
\
@datacite_conversion_errors
,
fulltext_locked
=>
$fulltext_locked
,
phaidra
=>
$phaidra
,
policies
=>
$policies
,
);
(
\
%row
,
$xml
);
}
sub
xml_escape
{
my
$s
=
shift
;
$s
=~
s/&/&/g
;
$s
=~
s/</</g
;
$s
=~
s/>/>/g
;
$s
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment