Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
irma2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Gerhard Gonter
irma2
Commits
3059a218
Commit
3059a218
authored
2 years ago
by
Gerhard Gonter
Browse files
Options
Downloads
Patches
Plain Diff
initial version of eod catalog checker
parent
37d62ee1
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
eod1.pl
+343
-0
343 additions, 0 deletions
eod1.pl
with
343 additions
and
0 deletions
eod1.pl
0 → 100755
+
343
−
0
View file @
3059a218
#!/usr/bin/perl
=head1 NAME
eod1.pl
=head1 DESCRIPTION
=cut
use
strict
;
use
Data::
Dumper
;
$
Data::Dumper::
Indent
=
1
;
$
Data::Dumper::
Sortkeys
=
1
;
use
FileHandle
;
use
utf8
;
binmode
(
STDIN
,
'
:utf8
'
);
binmode
(
STDOUT
,
'
:utf8
'
);
autoflush
STDOUT
1
;
binmode
(
STDERR
,
'
:utf8
'
);
autoflush
STDERR
1
;
use
Util::
ts
;
use
Util::
JSON
;
use
MongoDB
;
use
Util::
MongoDB
;
use
lib
'
lib
';
use
Alma::
MARC_Extractor
;
use
IRMA::
NA
;
my
$agent_config_file
=
'
/etc/irma/eodagent.json
';
my
@PARS
;
my
$arg
;
while
(
defined
(
$arg
=
shift
(
@ARGV
)))
{
if
(
$arg
eq
'
-
')
{
push
(
@PARS
,
'
-
');
}
elsif
(
$arg
eq
'
--
')
{
push
(
@PARS
,
@ARGV
);
@ARGV
=
();
}
elsif
(
$arg
=~
/^--(.+)/
)
{
my
(
$opt
,
$val
)
=
split
('
=
',
$
1
,
2
);
if
(
$opt
eq
'
help
')
{
usage
();
}
else
{
usage
();
}
}
elsif
(
$arg
=~
/^-(.+)/
)
{
foreach
my
$opt
(
split
('',
$
1
))
{
if
(
$opt
eq
'
h
')
{
usage
();
exit
(
0
);
}
# elsif ($opt eq 'x') { $x_flag= 1; }
else
{
usage
();
}
}
}
else
{
push
(
@PARS
,
$arg
);
}
}
print
join
('
',
__FILE__
,
__LINE__
,
'
caller=[
'
.
caller
()
.
'
]
'),
"
\n
";
my
$agent_cnf
=
Util::JSON::
read_json_file
(
$agent_config_file
);
# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
# Step 1: get list of books from inventory
# get handles for various databases
my
$inv_db
=
IRMA::db::
get_any_db
(
$agent_cnf
,
'
inventory_database
');
my
$foxml_col
=
$inv_db
->
get_collection
('
foxml.data
');
# print __LINE__, " foxml_col: ", Dumper($foxml_col);
print
__LINE__
,
"
foxml_col=[
$foxml_col
]
\n
";
my
@foxml_columns
=
qw(ownerId state model ac_number aleph_url pid)
;
# my %foxml_columns= map { $_ => 1 } @foxml_columns;
my
$search
=
{
ownerId
=>
'
ondemae7
',
state
=>
'
Active
',
model
=>
'
Book
'
};
my
%counters
;
my
$cur
=
$foxml_col
->
find
(
$search
);
# print __LINE__, ' cur: ', Dumper($cur);
my
%ac_numbers
=
();
my
%pids
=
();
my
@books
=
();
while
(
my
$rec
=
$cur
->
next
())
{
# print __LINE__, " rec: ", Dumper($rec);
my
%book
=
map
{
$_
=>
$rec
->
{
$_
}
}
@foxml_columns
;
# print __LINE__, " book: ", Dumper(\%book);
$book
{
phaidra_url
}
=
'
https://phaidra.univie.ac.at/
'
.
$book
{
pid
};
push
(
@
{
$ac_numbers
{
$book
{
ac_number
}}}
=>
\
%book
);
$pids
{
$book
{
pid
}}
=
\
%book
;
push
(
@books
,
\
%book
);
}
print
__LINE__
,
"
checking for duplicate ac_numbers
\n
";
foreach
my
$ac_number
(
keys
%ac_numbers
)
{
my
$x
=
$ac_numbers
{
$ac_number
};
# print __LINE__, " ac_number=[$ac_number] pids=[", join(', ', map { $_->{pid} } @$x), "]\n";
if
(
@$x
!=
1
)
{
# either this is a duplicate or a member of a collection (ZS) where the Alma record should point to the collection instead
print
__LINE__
,
"
duplicate_ac_number=[
$ac_number
] pids=[
",
join
('
,
',
map
{
$_
->
{
pid
}
}
@$x
),
"
]
\n
";
$counters
{
duplicate_ac_number
}
++
;
# TODO: find out, why this is a duplicate, possibly do not mark the "canonical" version which should be identified in a ticket or so
# for now, just leave them out of further processing.
foreach
my
$book
(
@$x
)
{
$book
->
{
ac_number_note
}
=
'
dup
';
}
}
}
my
$book_count
=
@books
;
print
__LINE__
,
"
found
$book_count
books
\n
";
# BEGIN step 2: check IRMA records
my
$irma_db
=
IRMA::db::
get_any_db
(
$agent_cnf
,
'
irma_database
');
my
$irma_col
=
$irma_db
->
get_collection
('
irma.map
');
# BEGIN step 3: check Alma records
my
$marc_db
=
IRMA::db::
get_any_db
(
$agent_cnf
,
'
marc_database
');
my
$marc_col
=
$marc_db
->
get_collection
('
alma.marc
');
my
$req_col
=
$marc_db
->
get_collection
('
requests
');
my
@marc_mex_fields
=
qw(df_doi val_doi df_phaidra val_phaidra df_urn val_urn ts_marc)
;
my
@marc_fields
=
qw(ac_number mms_id fetched lib_code)
;
my
@marc_extra_fields
=
qw(marc_record ts_fetched)
;
my
$mex
=
Alma::
MARC_Extractor
->
new
(
\
@marc_fields
);
$mex
->
{
mex_ot2ut
}
=
0
;
$mex
->
{
mex_phaidra
}
=
1
;
my
@books_duplicate_ac_number
;
my
@books_problems
;
my
@books_ok
;
BOOK3:
foreach
my
$book
(
@books
)
{
my
@irma_notes
=
();
my
(
$ac_number
,
$pid
)
=
map
{
$book
->
{
$_
}
}
qw(ac_number pid)
;
unless
(
defined
(
$ac_number
)
&&
$ac_number
=~
m#^AC\d{8}$#
)
{
$book
->
{
problem
}
=
"
invalid ac_number=[
$ac_number
]
";
push
(
@books_problems
,
$book
);
$book
->
{
problem
}
=
'
invalid_ac_number
';
$counters
{
invalid_ac_number
}
++
;
next
BOOK3
;
}
my
@irma_records
=
$irma_col
->
find
({
ac_number
=>
$ac_number
})
->
all
();
print
__LINE__
,
"
ac_number=[
$ac_number
] irma_records:
",
scalar
@irma_records
,
'
',
Dumper
(
\
@irma_records
);
my
$marc
=
$marc_col
->
find_one
({
ac_number
=>
$ac_number
});
# print __LINE__, " marc: ", Dumper($marc);
unless
(
defined
(
$marc
))
{
$counters
{
missing_marc_record
}
++
;
$book
->
{
problem
}
=
'
missing_marc_record
';
push
(
@books_problems
,
$book
);
next
BOOK3
;
}
if
(
$book
->
{
ac_number_note
})
{
$counters
{
has_ac_number_note
}
++
;
push
(
@books_duplicate_ac_number
,
$book
);
# save for later, see above near the duplicate check
next
BOOK3
;
}
my
@alma_notes
=
();
$book
->
{
ts_fetched
}
=
Util::ts::
ts_ISO_gmt
(
$marc
->
{
fetched
});
$mex
->
extract_identifiers2
(
$marc
,
$book
);
if
(
exists
(
$book
->
{
mex
}))
{
my
$mex
=
$book
->
{
mex
};
print
__LINE__
,
"
ac_number=[
$ac_number
] mex:
",
Dumper
(
$mex
);
foreach
my
$mf
(
keys
%$mex
)
{
my
$mfa
=
$mex
->
{
$mf
};
if
(
@$mfa
!=
1
)
{
# this should be inspected
print
__LINE__
,
"
ac_number=[
$ac_number
] multiple entries for
$mf
mex:
",
Dumper
(
$mex
);
$counters
{
multiple_mex_entries
}
++
;
$book
->
{
problems
}
=
'
multiple_mex_entries
';
push
(
@books_problems
,
$book
);
next
BOOK3
;
}
foreach
my
$mfe
(
@$mfa
)
{
my
(
$df
,
$val
)
=
@$mfe
;
my
$copy
=
0
;
if
(
$mf
eq
'
phaidra
')
{
if
(
$val
=~
m#^https?://phaidra.univie.ac.at/(.*)(o:\d+)$#
)
{
$copy
=
1
;
# only transcribe those URLs that really look like some
my
(
$extra
,
$found_pid
)
=
(
$
1
,
$
2
);
my
$correct_url
=
'
https://phaidra.univie.ac.at/
'
.
$found_pid
;
if
(
$correct_url
ne
$book
->
{
phaidra_url
}
||
$extra
ne
'')
{
$book
->
{
update_phaidra_url
}
=
$book
->
{
phaidra_url
};
$counters
{
update_phaidra_url
}
++
;
push
(
@alma_notes
,
'
update_phaidra_url
');
}
}
}
elsif
(
$mf
eq
'
hdl
')
{
if
(
$val
=~
m#^11353/10\.(\d+)$#
)
{
$copy
=
1
;
# only transcribe those URLs that really look like some
my
$found_pid
=
'
o:
'
.
$
1
;
if
(
$found_pid
ne
$pid
)
{
# TODO:
$book
->
{
update_hdl
}
=
$book
->
{
hdl
};
$counters
{
update_hdl
}
++
;
push
(
@alma_notes
,
'
update_hdl
');
}
}
}
elsif
(
$mf
eq
'
doi
')
{
if
(
$val
=~
m#^10.25365/digital-copy\.(\d+)$#
)
{
$copy
=
1
;
# only transcribe those URLs that really look like some
# TODO: DOI checken
# if ($found_pid ne $pid)
# { # TODO:
# $book->{update_hdl}= $book->{hdl};
# $counters{update_hdl}++;
# push(@alma_notes, 'update_hdl');
# }
}
}
elsif
(
$mf
eq
'
urn
')
{
if
(
$val
=~
m#^urn:nbn:at:at-ubw-\d{5}\.\d{5}\.\d{6}-\d$#
)
{
$copy
=
1
;
# only transcribe those URLs that really look like some
# TODO: check if this is the known nbn!
}
}
if
(
$copy
)
{
$book
->
{'
df_
'
.
$mf
}
=
$df
;
$book
->
{'
val_
'
.
$mf
}
=
$val
;
}
else
{
push
(
@alma_notes
,
"
junk_
${mf}
_in_marc
");
print
__LINE__
,
"
ATTN: ac_number=[
$ac_number
] pid=[
$pid
] junk data mf=[
$mf
] in marc record: df=[
$df
] val=[
$val
]
\n
";
}
}
}
}
else
{
# this is ok, not a bug, Alma simply does not know anything about this phaidra object
print
__LINE__
,
"
ac_number=[
$ac_number
] mex missing
\n
";
}
unless
(
exists
(
$book
->
{
df_phaidra
}))
{
# TODO: if there is no phaidra_url at all, set it...
$counters
{
set_phaidra_url
}
++
;
$book
->
{
update_phaidra_url
}
=
$book
->
{
phaidra_url
};
push
(
@alma_notes
,
'
set_phaidra_url
');
}
push
(
@alma_notes
,
'
ok
')
unless
(
@alma_notes
);
$book
->
{
alma_notes
}
=
join
('
,
',
@alma_notes
);
push
(
@books_ok
,
$book
);
# print __LINE__, " book: ", Dumper($book);
$counters
{
ok
}
++
;
}
# END check Alma records
print
__LINE__
,
"
counters:
",
Dumper
(
\
%counters
);
Util::JSON::
write_json_file
('
eod_problems.json
',
\
@books_problems
);
my
@tsv_columns
=
qw( pid alma_notes ownerId state model ac_number ac_number_note aleph_url
marc_record ts_fetched fetched ts_marc mms_id lib_code
df_doi val_doi
df_urn val_urn
df_hdl val_hdl update_hdl
df_phaidra val_phaidra phaidra_url update_phaidra_url
)
;
write_tsv_file
('
eod_data.tsv
',
\
@books_ok
);
write_tsv_file
('
duplicate_eod_data.tsv
',
\
@books_duplicate_ac_number
);
sub
write_tsv_file
{
my
$tsv_filename
=
shift
;
my
$book_list
=
shift
;
open
(
TSV
,
'
>:utf8
',
$tsv_filename
)
or
die
"
can't write to
$tsv_filename
";
my
$count_books_ok
=
@$book_list
;
print
"
saving
$count_books_ok
to
$tsv_filename
\n
";
print
TSV
join
("
\t
",
@tsv_columns
),
"
\n
";
BOOKx:
foreach
my
$book
(
@$book_list
)
{
print
TSV
join
("
\t
",
map
{
$book
->
{
$_
}
}
@tsv_columns
),
"
\n
";
}
close
(
TSV
);
}
__END__
=head1 EXAMPLES
=head2 AC02901724
-[ RECORD 1 ]----------------------
pid | o:90496
alma_notes | update_phaidra_url
ownerId | ondemae7
state | Active
model | Book
ac_number | AC02901724
aleph_url | http://aleph.univie.ac.at/F?func=find-c&ccl_term=AC02901724
marc_record | marc_data_found
ts_fetched | 2023-03-18T173800
fetched | 1679161080
ts_marc | 20230303140010.0
mms_id | 990013785100203332
lib_code | 43ACC_UBW
df_phaidra | 856:4: :u
val_phaidra | http://phaidra.univie.ac.at/o:90495
phaidra_url | https://phaidra.univie.ac.at/o:90496
update_phaidra_url | https://phaidra.univie.ac.at/o:90496
o:90495 is a collection, o:90496 is the only member; this is a ZS record,
so this is ok and the suggested change is not correct.
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment