Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
irma2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Gerhard Gonter
irma2
Commits
37d62ee1
Commit
37d62ee1
authored
Sep 26, 2022
by
Gerhard Gonter
Browse files
Options
Downloads
Patches
Plain Diff
additional functionality for ut1.pl
parent
4a0a8096
No related branches found
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
lib/DataCite/API.pm
+3
-3
3 additions, 3 deletions
lib/DataCite/API.pm
lib/Univie/Utheses/API.pm
+39
-19
39 additions, 19 deletions
lib/Univie/Utheses/API.pm
ut1.pl
+549
-71
549 additions, 71 deletions
ut1.pl
with
591 additions
and
93 deletions
lib/DataCite/API.pm
+
3
−
3
View file @
37d62ee1
...
...
@@ -52,7 +52,7 @@ sub register_doi
unless
(
$code1
=~
m#^20[01]#
)
{
print
STDERR
"
ATTN: register_doi POST metadata returned code1=[
$code1
] res1=[
$res1
]
\n
";
return
undef
;
return
(
0
,
$res1
,
undef
)
;
}
my
$doi_reg
=
<<"EOX";
...
...
@@ -66,10 +66,10 @@ EOX
unless
(
$code2
=~
m#^20[01]#
)
{
print
STDERR
"
ATTN: register_doi POST doi returned code2=[
$code2
] res2=[
$res2
]
\n
";
return
undef
;
return
(
0
,
$res1
,
$res2
)
;
}
1
;
(
1
,
$res1
,
$res2
)
;
}
sub
datacite_request
...
...
This diff is collapsed.
Click to expand it.
lib/Univie/Utheses/API.pm
+
39
−
19
View file @
37d62ee1
...
...
@@ -20,24 +20,22 @@ sub getContainerPublicMetadata
my
$self
=
shift
;
my
$utheses_id
=
shift
;
my
(
$code1
,
$res1
)
=
$self
->
utheses_request
('
GET
',
'
container/get/public
',
$utheses_id
);
print
__LINE__
,
"
code1=[
$code1
] res1=[
$res1
]
\n
";
my
(
$code1
,
$res1
,
$info
)
=
$self
->
utheses_request
('
GET
',
'
container/get/public
',
$utheses_id
);
print
__LINE__
,
"
code1=[
$code1
] res1=[
$res1
]
info=[
$info
]
\n
";
my
$info
;
if
(
$code1
=~
m#^2#
)
{
eval
{
$info
=
from_json
(
$res1
);
};
if
(
$@
)
{
die
$@
;
}
# TODO: Error handling!
$info
;
}
sub
getPendingDoisCreateRequest
{
my
$self
=
shift
;
my
(
$code1
,
$res1
,
$info
)
=
$self
->
utheses_request
('
GET
',
'
doi/get/createRequest
',
undef
,
);
print
__LINE__
,
"
code1=[
$code1
] res1=[
$res1
] info=[
$info
]
\n
";
# TODO: Error handling!
$info
;
(
wantarray
)
?
(
$code1
,
$res1
,
$info
)
:
$info
;
}
sub
utheses_request
...
...
@@ -45,20 +43,42 @@ sub utheses_request
my
$self
=
shift
;
my
$method
=
shift
;
my
$what
=
shift
;
my
$
id
=
shift
;
my
$
par
=
shift
;
my
(
$api_url
)
=
map
{
$self
->
{
config
}
->
{
$_
}
}
qw(api_url)
;
my
(
$api_url
,
$headers
)
=
map
{
$self
->
{
config
}
->
{
$_
}
}
qw(api_url
headers
)
;
my
$req_url
=
join
('
/
',
$api_url
,
$what
,
$
id
);
my
$req_url
=
join
('
/
',
$api_url
,
$what
,
$
par
);
print
__LINE__
,
"
req_url=[
$req_url
]
\n
";
my
$req
=
HTTP::
Request
->
new
(
$method
=>
$req_url
);
if
(
defined
(
$headers
))
{
foreach
my
$h
(
@$headers
)
{
$req
->
header
(
@$h
);
}
}
my
$ua
=
LWP::
UserAgent
->
new
;
my
$res
=
$ua
->
request
(
$req
);
# print __LINE__, " res: ", main::Dumper($res);
my
$txt
=
decode
("
utf8
",
$res
->
content
());
return
(
$res
->
code
(),
$txt
);
my
$code
=
$res
->
code
();
my
$info
;
if
(
$code
=~
m#^2#
)
{
eval
{
$info
=
from_json
(
$txt
);
};
if
(
$@
)
{
die
$@
;
# TODO: this should be handled mor gracefully!
}
}
return
(
$code
,
$txt
,
$info
);
}
1
;
...
...
This diff is collapsed.
Click to expand it.
ut1.pl
+
549
−
71
View file @
37d62ee1
...
...
@@ -3,6 +3,7 @@
use
strict
;
use
FileHandle
;
use
utf8
;
binmode
(
STDOUT
,
'
:utf8
'
);
autoflush
STDOUT
1
;
binmode
(
STDERR
,
'
:utf8
'
);
autoflush
STDERR
1
;
...
...
@@ -11,21 +12,36 @@ binmode( STDIN, ':utf8' );
use
Data::
Dumper
;
$
Data::Dumper::
Indent
=
1
;
use
Digest::MD5::
File
qw(file_md5_hex)
;
use
Util::
ts
;
use
Util::
JSON
;
use
Redmine::DB::
MySQL
;
use
Phaidra::Utils::
iso639
;
use
lib
'
lib
';
use
Univie::Utheses::
API
;
use
Phaidra::Utils::
iso639
;
use
Util::
JSON
;
use
DataCite::
API
;
use
IRMA::
NA
;
use
Alma::
MARC_Extractor
;
my
@TSV_COLUMNS
=
qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors )
;
# my $op_mode= 'fetch_metadata_bulk';
my
$op_mode
=
'
analyze
';
my
$op_mode
=
'
process
';
my
$fnm_tsv
=
'
utheses/utheses_info.tsv
';
# TODO: timestamp!
my
$config_file
=
'
/etc/irma/DataCite.json
';
# TODO: make these configurable too!
my
$agent_config_file
=
'
/etc/irma/pidagent.json
';
my
$MAX_MARC_AGE
=
86400
*
60
;
my
$MAX_MARC_REQUESTS
=
10_000
;
my
$do_register_doi
=
0
;
my
$agent_name
=
'
pidagent
';
my
$agent_id
=
$$
;
my
$fix_problems
=
0
;
my
@pars
=
();
while
(
my
$arg
=
shift
(
@ARGV
))
...
...
@@ -36,6 +52,7 @@ while (my $arg= shift (@ARGV))
my
(
$opt
,
$val
)
=
split
('
=
',
$
1
,
2
);
if
(
$opt
eq
'
help
')
{
usage
();
}
elsif
(
$opt
eq
'
register-doi
')
{
$do_register_doi
=
1
;
}
elsif
(
$opt
eq
'
fix
')
{
$fix_problems
=
1
;
}
else
{
usage
();
}
}
elsif
(
$arg
=~
/^-(.+)/
)
...
...
@@ -53,25 +70,84 @@ while (my $arg= shift (@ARGV))
}
}
my
$cnf
=
Util::JSON::
read_json_file
(
$config_file
);
# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0);
my
$ut_cnf
=
$cnf
->
{
repositories
}
->
{'
utheses.univie.ac.at
'};
# load configuration
my
$agent_cnf
=
Util::JSON::
read_json_file
(
$agent_config_file
);
# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
my
$dc_cnf
=
Util::JSON::
read_json_file
(
$agent_cnf
->
{
DataCite_config
});
# print __LINE__, " dc_cnf: ", main::Dumper ($dc_cnf); exit(0);
my
$ut_cnf
=
$dc_cnf
->
{
repositories
}
->
{'
utheses.univie.ac.at
'};
# print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0);
my
$reg_cnf
=
$cnf
->
{
doi_registries
}
->
{
$ut_cnf
->
{
registry
}};
# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0);
my
$
dc_
reg_cnf
=
$
dc_
cnf
->
{
doi_registries
}
->
{
my
$dc_cnf_name
=
$ut_cnf
->
{
registry
}};
# print __LINE__, "
dc_
reg_cnf: ", main::Dumper ($
dc_
reg_cnf); exit(0);
my
$utapi
=
new
Univie::Utheses::
API
(
config
=>
{
api_url
=>
$ut_cnf
->
{
api_url
}
}
);
die
"
no utapi
"
unless
(
defined
(
$utapi
));
# prepare APIs
my
$ut_api
=
new
Univie::Utheses::
API
(
config
=>
$ut_cnf
->
{
api_config
}
);
die
"
no ut_api
"
unless
(
defined
(
$ut_api
));
my
$reg_obj
=
new
DataCite::
API
(
config
=>
$reg_cnf
,
xmode
=>
'
test
');
my
$reg_obj
=
new
DataCite::
API
(
config
=>
$
dc_
reg_cnf
,
xmode
=>
'
test
');
die
"
no reg_obj
"
unless
(
defined
(
$reg_obj
));
if
(
$op_mode
eq
'
analyze
')
# get handles for various databases
my
$marc_db
=
IRMA::db::
get_any_db
(
$agent_cnf
,
'
marc_database
');
my
$agent_db
=
IRMA::db::
get_any_db
(
$agent_cnf
,
'
pidagent_database
');
print
__LINE__
,
"
agent_db=[
$agent_db
]
\n
";
my
@marc_fields
=
qw(ac_number mms_id fetched lib_code)
;
my
$mex
=
Alma::
MARC_Extractor
->
new
(
\
@marc_fields
);
my
$running
=
0
;
if
(
$op_mode
eq
'
process
')
{
do
{
while
(
my
$par
=
shift
(
@pars
))
{
if
(
$par
=~
m#^(\d+)$#
||
$par
=~
m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?#
)
{
my
$utheses_id
=
$
1
;
my
@actions
=
process_utheses_item
(
$utheses_id
);
print
__LINE__
,
"
process_utheses_item(
$utheses_id
) ==> [
",
join
('
,
',
@actions
),
"
]
\n
";
}
elsif
(
$par
=~
m#^(\d+)\.\.(\d+)$#
||
$par
=~
m#^(\d+)\-(\d+)$#
||
$par
=~
m#(blk)(\d+)$#
)
{
my
(
$start
,
$end
)
=
(
$
1
,
$
2
);
(
$start
,
$end
)
=
(
$end
*
100
,
$end
*
100
+
99
)
if
(
$start
eq
'
blk
');
my
$item_count
=
$end
-
$start
;
if
(
$start
<=
$end
&&
((
$item_count
<=
2000
&&
$do_register_doi
)
||
(
$item_count
<=
5000
&&
!
$do_register_doi
)))
{
foreach
(
my
$utheses_id
=
$start
;
$utheses_id
<=
$end
;
$utheses_id
++
)
{
push
(
@pars
,
"
$utheses_id
");
# cast to string!
}
}
}
elsif
(
$par
=~
m#^AC\d{8}$#
)
{
foreach
my
$par
(
@pars
)
my
@actions
=
process_ac_number
(
$par
);
print
__LINE__
,
"
process_ac_number(
$par
) ==> [
",
join
('
,
',
@actions
),
"
]
\n
";
}
elsif
(
$par
eq
'
queue
')
{
my
@utheses_ids
=
get_job_from_queue
();
print
__LINE__
,
'
queued utheses_ids:
',
join
('
,
',
@utheses_ids
),
"
\n
";
sleep
(
10
);
push
(
@pars
,
@utheses_ids
,
'
cleanup
');
}
elsif
(
$par
eq
'
cleanup
')
{
if
(
$par
=~
m#^\d+$#
)
{
analyze_utheses_item
(
$par
);
}
cleanup_queue
();
}
elsif
(
$par
eq
'
gpdcr
')
{
my
@utheses_ids
=
gpdcr
();
# push (@pars, @utheses_ids);
}
sleep
(
2
)
if
(
@pars
);
}
}
while
(
$running
);
}
elsif
(
$op_mode
eq
'
fetch_metadata_bulk
')
{
...
...
@@ -79,55 +155,372 @@ elsif ($op_mode eq 'fetch_metadata_bulk')
}
exit
(
0
);
sub
analyze_utheses_item
sub
gpdcr
{
my
(
$status
,
$txt
,
$info
)
=
$ut_api
->
getPendingDoisCreateRequest
();
print
__LINE__
,
"
gpdcr: info:
",
Dumper
(
$info
);
my
@utheses_ids
=
();
if
(
$status
eq
'
200
')
{
my
$p
=
$info
->
{
pendingDois
};
if
(
defined
(
$p
)
&&
ref
(
$p
)
eq
'
HASH
')
{
push
(
@utheses_ids
,
map
{
$p
->
{
$_
}
->
{
utheses_id
}
}
keys
%$p
);
}
else
{
die
"
unexpected type of pendingDois
";
}
}
@utheses_ids
;
}
sub
cleanup_queue
{
my
$q_col
=
$agent_db
->
get_collection
('
queue
');
my
$j
=
$q_col
->
find_one
({
status
=>
"
in_progress
",
agent_name
=>
$agent_name
,
agent_id
=>
$agent_id
});
if
(
defined
(
$j
))
{
print
__LINE__
,
"
finishing old job
",
Dumper
(
$j
);
$q_col
->
update
({
_id
=>
$j
->
{
_id
}},
{
'
$set
'
=>
{
status
=>
"
done
",
agent_name
=>
$agent_name
,
agent_id
=>
$agent_id
}});
sleep
(
10
);
}
}
sub
get_job_from_queue
{
my
$q_col
=
$agent_db
->
get_collection
('
queue
');
my
$j
=
$q_col
->
find_one
({
status
=>
"
in_progress
",
agent_name
=>
$agent_name
});
if
(
defined
(
$j
))
{
print
__LINE__
,
"
resuming old job
",
Dumper
(
$j
);
}
else
{
$j
=
$q_col
->
find_one
({
status
=>
"
new
"});
}
return
()
unless
(
defined
(
$j
));
print
__LINE__
,
"
found new job
",
Dumper
(
$j
);
$q_col
->
update
({
_id
=>
$j
->
{
_id
}},
{
'
$set
'
=>
{
status
=>
"
in_progress
",
agent_name
=>
$agent_name
,
agent_id
=>
$agent_id
}});
my
@ut_ids
;
if
(
exists
(
$j
->
{
utheses_ids
}))
{
push
(
@ut_ids
,
@
{
$j
->
{
utheses_ids
}});
}
if
(
exists
(
$j
->
{
utheses_id
}))
{
push
(
@ut_ids
,
$j
->
{
utheses_id
});
}
return
@ut_ids
;
}
sub
process_utheses_item
{
my
$utheses_id
=
shift
;
my
(
$row
,
$xml
)
=
get_utheses_metadata
(
$utheses_id
);
print
__LINE__
,
"
process_utheses_item: utheses_id=[
$utheses_id
]
",
'
=
'
x50
,
"
\n
";
my
$register_doi_ok
=
1
;
my
$row
=
{};
my
(
$error_code
,
$status
,
$xml
)
=
get_utheses_metadata
(
$row
,
$utheses_id
);
print
__LINE__
,
"
utheses_id=[
$utheses_id
]
",
'
=
'
x50
,
"
\n
";
print
__LINE__
,
"
error_code=[
$error_code
] status=[
$status
]
\n
";
print
__LINE__
,
"
xml=[
$xml
]
\n
";
print
__LINE__
,
"
row:
",
Dumper
(
$row
);
if
(
$do_register_doi
)
my
@actions
=
('
fetched_utheses
');
unless
(
$status
eq
'
200
')
{
push
(
@actions
,
'
no_utheses_record
');
return
@actions
;
}
unless
(
$error_code
eq
'
ok
')
{
push
(
@actions
,
"
error_code=[
$error_code
]
");
report_problem
(
{
area
=>
'
utheses
',
problem
=>
'
error_code
',
utheses_id
=>
$utheses_id
,
error_code
=>
$error_code
}
);
$register_doi_ok
=
0
;
}
my
@x
=
analyze_marc_record
(
$row
);
push
(
@actions
,
@x
);
unless
(
defined
(
$agent_db
))
{
return
@actions
;
}
my
$ut_col
=
$agent_db
->
get_collection
('
utheses
');
if
(
$fix_problems
)
{
remove_problem
('
utheses
',
$row
->
{
utheses_id
});
}
# first: compare existing utheses record (if it exists) with data from utheses
my
$ut_data
=
$ut_col
->
find_one
(
{
utheses_id
=>
$row
->
{
utheses_id
}
}
);
if
(
defined
(
$ut_data
))
{
my
@cmp_fields
=
qw(doi urn nbn ac_number)
;
my
@problems
=
();
foreach
my
$f
(
@cmp_fields
)
{
if
(
exists
(
$ut_data
->
{
$f
})
&&
defined
(
$ut_data
->
{
$f
})
&&
$row
->
{
$f
}
ne
$ut_data
->
{
$f
})
{
push
(
@problems
,
{
problem
=>
'
missmatch
',
field
=>
$f
,
recorded
=>
$ut_data
->
{
$f
},
found
=>
$row
->
{
$f
}
});
}
}
if
(
@problems
)
{
report_problem
(
{
area
=>
'
utheses
',
problem
=>
'
utheses_data_missmatch
',
utheses_id
=>
$utheses_id
,
problems
=>
\
@problems
}
);
push
(
@actions
,
'
problem report utheses
');
return
(
@actions
);
}
}
# second: update utheses record in database
my
$res_upd
=
$ut_col
->
update
(
{
utheses_id
=>
$row
->
{
utheses_id
}
},
$row
,
{
upsert
=>
1
}
);
print
__LINE__
,
"
res_upd:
",
Dumper
(
$res_upd
);
my
(
$ok
,
$verdict
,
$msg1
,
$msg2
);
if
(
$do_register_doi
&&
$register_doi_ok
)
{
if
(
$fix_problems
)
{
remove_problem
('
datacite
',
$row
->
{
utheses_id
},
'
utheses_id
');
remove_problem
('
datacite
',
$row
->
{
doi
});
}
my
$dc_col
=
$agent_db
->
get_collection
('
datacite
');
my
(
$utheses_id
,
$doi
,
$xml_md5
,
$url
)
=
map
{
$row
->
{
$_
}
}
qw(utheses_id doi xml_md5 persistent_link)
;
my
$dc_record
=
$dc_col
->
find_one
(
{
doi
=>
$doi
}
);
# prepare registration data
my
%reg_info
=
(
doi
=>
$row
->
{
doi
},
url
=>
$url
,
context
=>
'
utheses
',
registry
=>
$dc_cnf_name
,
utheses_id
=>
$utheses_id
,
ac_number
=>
$row
->
{
ac_number
},
xml_md5
=>
$xml_md5
,
ts_epoch
=>
time
(),
ts_iso_gmt
=>
Util::ts::
ts_ISO_gmt
(),
);
# check existing datacite registration record, if it exists
if
(
defined
(
$dc_record
))
{
print
__LINE__
,
"
datacite record found:
",
Dumper
(
$dc_record
);
# check if record matches with new data
my
@cmp_fields
=
qw(registry context url utheses_id ac_number)
;
my
@problems
=
();
foreach
my
$f
(
@cmp_fields
)
{
if
(
exists
(
$dc_record
->
{
$f
})
&&
defined
(
$dc_record
->
{
$f
}))
{
if
(
$reg_info
{
$f
}
ne
$dc_record
->
{
$f
})
{
push
(
@problems
,
{
problem
=>
'
missmatch
',
field
=>
$f
,
recorded
=>
$dc_record
->
{
$f
},
found
=>
$reg_info
{
$f
}
});
}
# else: record matches, that's ok
}
else
{
push
(
@problems
,
{
problem
=>
'
missing
',
field
=>
$f
});
}
}
if
(
@problems
)
{
report_problem
(
{
area
=>
'
datacite
',
problem
=>
'
datacite_data_missmatch
',
utheses_id
=>
$utheses_id
,
doi
=>
$doi
,
problems
=>
\
@problems
}
);
push
(
@actions
,
'
problem report datacite
');
return
(
@actions
);
}
if
(
$dc_record
->
{
xml_md5
}
eq
$xml_md5
&&
$dc_record
->
{
reg_status
}
==
1
)
{
print
__LINE__
,
"
datacite metadata unchanged; not updating!
\n
";
push
(
@actions
,
'
datacite doi metadata unchanged
');
return
@actions
;
}
}
(
$ok
,
$verdict
,
$msg1
,
$msg2
)
=
register_doi_with_DataCite_for_utheses
(
$row
,
$xml
);
print
__LINE__
,
"
ok=[
$ok
] msg1=[
$msg1
] msg2=[
$msg2
] verdict=[
$verdict
]
\n
";
push
(
@actions
,
((
$ok
)
?
'
datacite doi registration ok
'
:
'
datacite doi registration failed
'),
$verdict
);
$reg_info
{
reg_status
}
=
$ok
;
$reg_info
{
reg_verdict
}
=
$verdict
,
$reg_info
{
reg_message1
}
=
$msg1
;
$reg_info
{
reg_message2
}
=
$msg2
;
print
__LINE__
,
"
insert into datacite; reg_info:
",
Dumper
(
\
%reg_info
);
my
$res
=
$dc_col
->
update
(
{
doi
=>
$doi
},
\
%reg_info
,
{
upsert
=>
1
}
);
print
__LINE__
,
"
insert res:
",
Dumper
(
$res
);
unless
(
$ok
)
{
report_problem
(
{
area
=>
'
datacite
',
problem
=>
'
registration failure
',
utheses_id
=>
$utheses_id
,
doi
=>
$doi
,
reg_info
=>
\
%reg_info
}
);
}
}
@actions
;
}
sub
process_ac_number
{
my
$ac_number
=
shift
;
my
$row
=
{};
my
@actions
=
analyze_marc_record
(
$row
,
$ac_number
);
if
(
exists
(
$row
->
{
val_utheses
}))
{
my
$ut_link
=
$row
->
{
val_utheses
};
if
(
$ut_link
=~
m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?#
)
{
my
$utheses_id
=
$
1
;
my
(
$error_code
,
$status
,
$xml
)
=
get_utheses_metadata
(
$row
,
$utheses_id
);
}
else
{
# TODO: report bad link in Alma
}
}
print
__LINE__
,
"
process_ac_number: ac_number=[
$ac_number
] row:
",
Dumper
(
$row
);
}
sub
analyze_marc_record
{
my
$row
=
shift
;
my
$ac_number
=
shift
||
$row
->
{
ac_number
};
return
(
$row
->
{
marc_record
}
=
'
no_marc_db
')
unless
(
defined
(
$marc_db
));
return
(
$row
->
{
marc_record
}
=
'
invalid_ac_number
')
unless
(
$ac_number
=~
m#^AC\d{8}$#
);
my
@actions
=
();
my
$marc_col
=
$marc_db
->
get_collection
('
alma.marc
');
my
$marc_rec
=
$marc_col
->
find_one
({
ac_number
=>
$ac_number
});
print
__LINE__
,
"
marc_rec:
",
Dumper
(
$marc_rec
);
my
$request_marc_rec
=
0
;
if
(
defined
(
$marc_rec
))
{
my
$marc_fetched
=
$marc_rec
->
{
fetched
};
my
$best_before
=
$marc_fetched
+
$MAX_MARC_AGE
;
my
$now
=
time
();
print
__LINE__
,
"
marc_fetched=[
$marc_fetched
] best_before=[
$best_before
] now=[
$now
]
\n
";
if
(
$best_before
>
$now
)
{
$row
->
{
marc_record
}
=
'
ok
';
}
else
{
print
__LINE__
,
"
marc_record too old
\n
";
$row
->
{
marc_record
}
=
'
too_old
';
$request_marc_rec
=
1
;
}
# check marc record, even when it is too old
$row
->
{
marc
}
=
my
$x
=
{};
$mex
->
extract_identifiers
(
$marc_rec
,
$x
);
push
(
@actions
,
'
marc_rec_checked
');
}
else
{
$row
->
{
marc_record
}
=
'
not_found
';
$request_marc_rec
=
1
;
}
if
(
$request_marc_rec
)
{
my
$req_col
=
$marc_db
->
get_collection
('
requests
');
my
$req
=
{
agent
=>
'
alma_cat
',
status
=>
'
new
',
action
=>
'
update_alma_2xml
',
ac_number
=>
$ac_number
,
requested_by
=>
$agent_name
,
};
$req_col
->
insert
(
$req
);
push
(
@actions
,
'
marc_rec_requested
');
print
__LINE__
,
"
analyze_marc_record: insert requests:
",
Dumper
(
$req
);
}
@actions
;
}
sub
register_doi_with_DataCite_for_utheses
{
my
$row
=
shift
;
my
$xml
=
shift
;
my
(
$rejected
,
$accepted
);
my
$ok
=
0
;
unless
(
defined
(
$row
))
{
print
"
ATTN:
can't register DOI: no utheses data found
\n
";
$rejected
=
"
can't register DOI: no utheses data found
";
goto
END
;
}
unless
(
defined
(
$xml
))
{
print
"
ATTN: can't register DOI: no xml data generated
\n
";
$rejected
=
"
can't register DOI: no xml data generated
";
goto
END
;
}
my
(
$doi
,
$url
,
$errors
,
$pol
)
=
map
{
$row
->
{
$_
}
}
qw(doi persistent_link datacite_conversion_errors policies)
;
my
(
$ftl
,
$lks
)
=
map
{
$pol
->
{
$_
}
}
qw(fulltext_locked lock_status)
;
if
(
!
defined
(
$ftl
)
||
$ftl
)
{
$rejected
=
"
can't register DOI: fulltext locked (
$ftl
)
";
goto
END
;
}
my
(
$doi
,
$url
,
$errors
,
$ftl
)
=
map
{
$row
->
{
$_
}
}
qw(doi persistent_link datacite_conversion_errors fulltext_locked)
;
if
(
$ftl
)
if
(
!
defined
(
$lks
)
||
$lks
)
{
print
"
ATTN:
can't register DOI:
fulltext locked
\n
";
$rejected
=
"
can't register DOI:
lock_status=[
$lks
]
";
goto
END
;
}
if
(
@$errors
)
{
print
"
ATTN:
can't register DOI
$doi
due to errors:
",
join
('
,
',
@$errors
)
,
"
\n
"
;
$rejected
=
"
can't register DOI
$doi
due to errors:
",
join
('
,
',
@$errors
);
goto
END
;
}
if
(
$reg_obj
->
register_doi
(
$doi
,
$xml
,
$url
))
my
(
$reg_res
,
$reg_msg1
,
$reg_msg2
)
=
$reg_obj
->
register_doi
(
$doi
,
$xml
,
$url
);
if
(
$reg_res
)
{
print
"
NOTE: register_doi doi=[
$doi
] url=[
$url
] OK
\n
";
$accepted
=
"
register_doi doi=[
$doi
] url=[
$url
] OK
";
$ok
=
1
;
}
else
{
print
"
ATTN: register_doi doi=[
$doi
] url=[
$url
] was not ok
\n
";
}
$rejected
=
"
register_doi doi=[
$doi
] url=[
$url
] was not ok
";
}
END
:
return
;
if
(
$rejected
)
{
print
"
REJECTED:
",
$rejected
,
"
\n
";
}
else
{
print
"
ACCEPTED:
",
$accepted
,
"
\n
";
}
return
(
$ok
,
(
$rejected
||
$accepted
),
$reg_msg1
,
$reg_msg2
);
}
sub
fetch_metadata_bulk
...
...
@@ -142,54 +535,90 @@ sub fetch_metadata_bulk
{
next
if
(
$utheses_id
eq
'
utheses_id
');
# CSV column name...
my
(
$row
,
$xml
)
=
get_utheses_metadata
(
$utheses_id
);
my
$row
;
my
(
$error_code
,
$status
,
$xml
)
=
get_utheses_metadata
(
$row
,
$utheses_id
);
if
(
defined
(
$row
))
{
print
TSV
join
("
\t
",
map
{
$row
->
{
$_
}
}
@TSV_COLUMNS
),
"
\n
";
}
}
}
=head2 my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id)
return DataCite_XML document for a given utheses_id and fill in $row
with important information
=cut
sub
get_utheses_metadata
{
my
$row
=
shift
;
my
$utheses_id
=
shift
;
print
__LINE__
,
"
utheses_id=[
$utheses_id
]
\n
";
my
$info
=
$utapi
->
getContainerPublicMetadata
(
$utheses_id
);
my
$info
=
$ut
_
api
->
getContainerPublicMetadata
(
$utheses_id
);
print
__LINE__
,
"
info:
",
Dumper
(
$info
);
my
(
$row
,
$xml
);
if
(
$info
->
{
status
}
eq
'
200
')
my
$xml
;
my
$status
=
$info
->
{
status
};
my
$error_code
=
'
unknown
';
if
(
$status
eq
'
200
')
{
(
$row
,
$xml
)
=
utheses2datacite
(
$info
,
$utheses_id
);
$xml
=
utheses2datacite
(
$row
,
$info
,
$utheses_id
);
# print __LINE__, " row: ", Dumper($row);
if
(
defined
(
$row
->
{
doi
}))
{
if
(
defined
(
$xml
)
&&
defined
(
my
$xml_fnm
=
$row
->
{
xml_fnm
}))
{
$row
->
{
xml_fnm
}
=
my
$xml_fnm
=
'
utheses/DataCite_XML/
'
.
$row
->
{
doi
}
.
'
.xml
';
# print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
if
(
defined
(
$xml
)
&&
defined
(
$row
->
{
xml_fnm
})
&&
!
@
{
$row
->
{
datacite_conversion_errors
}})
{
open
(
XML
,
'
>:utf8
',
$row
->
{
xml_fnm
})
or
die
;
# TODO: or do something else...
open
(
XML
,
'
>:utf8
',
$xml_fnm
)
or
die
"
can't write to
$xml_fnm
";
# TODO: or do something else...
print
XML
$xml
;
close
(
XML
);
my
$md5
=
file_md5_hex
(
$xml_fnm
);
print
__LINE__
,
"
xml_fnm=[
$xml_fnm
] md5=[
$md5
]
\n
";
$row
->
{
xml_md5
}
=
$md5
;
}
if
(
$row
->
{
datacite_conversion_error_count
}
==
0
)
{
$error_code
=
'
ok
';
}
else
{
printf
ERRORS
("
utheses_id=%d datacite conversion errors
\n
",
$utheses_id
);
$error_code
=
'
datacite_conversion_errors
';
}
}
else
{
printf
ERRORS
("
utheses_id=%d status=%s
\n
",
$utheses_id
,
$info
->
{
status
});
printf
ERRORS
("
utheses_id=%d no doi defined
\n
",
$utheses_id
);
$error_code
=
'
no_doi_defined
';
}
}
else
{
printf
ERRORS
("
utheses_id=%d bad status=%s
\n
",
$utheses_id
,
$status
);
$error_code
=
'
bad_status
';
}
return
(
$row
,
$xml
);
$row
->
{
error_code
}
=
$error_code
;
return
(
$error_code
,
$status
,
$xml
);
}
=head2 my $xml= utheses2datacite ($row, $info, $utheses_id)
Transcribe utheses $info for a given utheses_id into xml and return that.
$row is filled with information extracted from $info.
=cut
sub
utheses2datacite
{
my
$row
=
shift
;
my
$info
=
shift
;
my
$utheses_id
=
shift
;
...
...
@@ -204,6 +633,9 @@ sub utheses2datacite
if
(
$doi
=~
m#^10.25365/(thesis\.\d+)$#
)
{
$suffix
=
$
1
;
# rewrite DOI, if necessary, e.g. for the test environment
$doi
=
join
('
/
',
$ut_cnf
->
{
use_prefix
},
$suffix
)
if
(
exists
(
$ut_cnf
->
{
use_prefix
}));
# $doi= join ('/', '10.493943', $suffix); # TEST: fake prefix
}
else
{
...
...
@@ -215,7 +647,6 @@ sub utheses2datacite
push
(
@datacite_conversion_errors
,
"
bad_url=[
$persistent_link
]
")
unless
(
$persistent_link
=~
m#^https://utheses.univie.ac.at/detail/\d+/?$#
);
my
(
$titles
,
$abstracts
,
$publication_date
,
$langs
,
$policies
)
=
map
{
$th
->
{
$_
}
}
qw(titles abstracts publication_date languages policies)
;
my
$fulltext_locked
=
$policies
->
{
fulltext_locked
};
my
$publication_year
;
if
(
$publication_date
=~
m#^(\d{4})-?#
)
...
...
@@ -249,6 +680,8 @@ sub utheses2datacite
<
creators
>
EOX
# $xml .= " <urxn>bla</urxn>\n"; # TEST: check what happens with invalid XML
foreach
my
$author
(
@$authors
)
{
$xml
.=
<<
"
EOX
";
...
...
@@ -317,23 +750,24 @@ EOX
</
resource
>
EOX
my
%row
=
(
utheses_id
=>
$utheses_id
,
suffix
=>
$suffix
,
doi
=>
$doi
,
nbn
=>
$nbn
,
ac_number
=>
$ac_number
,
langs
=>
join
('
,
',
@$langs
),
language
=>
$main_language
,
persistent_link
=>
$persistent_link
,
datacite_conversion_errors
=>
\
@datacite_conversion_errors
,
fulltext_locked
=>
$fulltext_locked
,
phaidra
=>
$phaidra
,
policies
=>
$policies
,
);
$row
->
{
utheses_fetched
}
=
time
();
$row
->
{
utheses_id
}
=
$utheses_id
;
$row
->
{
suffix
}
=
$suffix
;
$row
->
{
doi
}
=
$doi
;
$row
->
{
nbn
}
=
$nbn
;
$row
->
{
ac_number
}
=
$ac_number
;
$row
->
{
langs
}
=
join
('
,
',
@$langs
);
$row
->
{
language
}
=
$main_language
;
$row
->
{
persistent_link
}
=
$persistent_link
;
$row
->
{
datacite_conversion_errors
}
=
\
@datacite_conversion_errors
;
$row
->
{
datacite_conversion_error_count
}
=
scalar
@datacite_conversion_errors
;
$row
->
{
phaidra
}
=
$phaidra
;
# ... mapped, see below
$row
->
{
policies
}
=
$policies
;
# ... mapped, see below
# foreach my $an (qw(fulltext_locked lock_status lock_until_date)) { $row->{$an}= $policies->{$an} }
# foreach my $an (qw(container_pid container_status thesis_doc_pid thesis_doc_status)) { $row->{$an}= $phaidra->{$an} }
(
\
%row
,
$xml
)
;
$xml
;
}
sub
xml_escape
...
...
@@ -347,3 +781,47 @@ sub xml_escape
$s
;
}
sub
report_problem
{
my
$problem
=
shift
;
my
$ut_col
=
$agent_db
->
get_collection
('
problems
');
$problem
->
{
ts_iso_gmt
}
=
Util::ts::
ts_ISO_gmt
();
my
$area
=
$problem
->
{
area
};
my
$check_id
;
if
(
$area
eq
'
utheses
')
{
$check_id
=
'
utheses_id
'
}
elsif
(
$area
eq
'
marc
')
{
$check_id
=
'
ac_number
'
}
elsif
(
$area
eq
'
datacite
')
{
$check_id
=
'
doi
'
}
$ut_col
->
update
(
{
area
=>
$area
,
$check_id
=>
my
$id
=
$problem
->
{
$check_id
}
},
$problem
,
{
upsert
=>
1
}
);
# replace problem report, if one already exists
print
__LINE__
,
"
ATTN: problem reported for
$check_id
=
$id
:
",
Dumper
(
$problem
);
}
sub
remove_problem
{
my
$area
=
shift
;
my
$id
=
shift
;
my
$check_id
=
shift
;
my
$ut_col
=
$agent_db
->
get_collection
('
problems
');
unless
(
defined
(
$check_id
))
{
if
(
$area
eq
'
utheses
')
{
$check_id
=
'
utheses_id
'
}
elsif
(
$area
eq
'
marc
')
{
$check_id
=
'
ac_number
'
}
elsif
(
$area
eq
'
datacite
')
{
$check_id
=
'
doi
'
}
}
$ut_col
->
remove
({
area
=>
$area
,
$check_id
=>
$id
});
}
__END__
=head1 TODO
* handle signals like SIGINT
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment