Skip to content
Snippets Groups Projects
Commit dcc987a2 authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

parser for static othes abstract files in index.html

parent 5ca6244f
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/perl
use strict;
use Data::Dumper;
$Data::Dumper::Indent= 1;
$Data::Dumper::Sortkeys= 1;
use FileHandle;
binmode( STDIN, ':utf8' );
binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
binmode( STDERR, ':utf8' ); autoflush STDERR 1;
use HTML::TreeBuilder 5 -weak; # Ensure weak references in use
use Util::JSON;
use FindBin;
use lib join('/', $FindBin::Bin, 'lib');
use IRMA::db;
my $config_fnm= '/etc/irma/eprints.json';
my $force= 0;
my @PARS;
my $arg;
while (defined ($arg= shift (@ARGV)))
{
if ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); }
elsif ($arg =~ /^--(.+)/)
{
my ($opt, $val)= split ('=', $1, 2);
if ($opt eq 'help') { usage(); }
elsif ($opt eq 'force') { $force= (defined ($val)) ? $val : 1; }
else { usage(); }
}
elsif ($arg =~ /^-(.+)/)
{
foreach my $opt (split ('', $1))
{
if ($opt eq 'h') { usage(); exit (0); }
elsif ($opt eq 'F') { $force= 1; }
else { usage(); }
}
}
else
{
push (@PARS, $arg);
}
}
my $cnf= Util::JSON::read_json_file ($config_fnm);
my $ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database');
my $col_othes_policy= $ot2ut->get_collection('othes.policy');
# print __LINE__, " ot2ut=[$ot2ut]\n";
PAR: foreach my $fnm (@PARS)
{
update_file($fnm);
}
exit(0);
sub update_file
{
my $fnm= shift;
my @fnm= split('/', $fnm);
my $last= pop (@fnm);
unless ($last eq 'index.html')
{
print __LINE__, " ATTN: not an index.html: last=[$last]\n";
return undef;
}
my @last4= splice(@fnm, -4);
my $eprint_id= join('', @last4)+0;
# print __LINE__, " last4=[$last4] [",join(':', @last4), "]\n"; return undef;
my @st= stat($fnm);
return undef unless (@st);
my $mtime= $st[9];
my $search= { eprint_id => "$eprint_id" };
# print __LINE__, " search: ", Dumper($search);
my $policy_rec= $col_othes_policy->find_one($search);
# print __LINE__, " policy_rec=[$policy_rec]\n";
if (defined ($policy_rec) && $policy_rec->{mtime} == $mtime && !$force)
{
# print __LINE__, " already processed; skipping\n";
# print __LINE__, " ", Dumper($policy_rec);
return undef;
}
my $policy_info= process_index_file($fnm);
$policy_info->{mtime}= $mtime;
# delete ($policy_info->{document_paths});
print __LINE__, " policy_info: ", Dumper($policy_info);
if (defined ($policy_rec))
{
my $rc_upd= $col_othes_policy->update( { _id => $policy_rec->{_id} }, $policy_info );
print __LINE__, " policy update: rc_upd=[$rc_upd] ", Dumper($rc_upd);
}
else
{
if (exists ($policy_info->{eprint_id}))
{
my $rc_ins= $col_othes_policy->insert( $policy_info );
print __LINE__, " policy insert: rc_ins=[$rc_ins] ", Dumper($rc_ins);
}
else
{
print __LINE__, " WARNING: eprint_id missing\n";
}
}
}
sub parse_document_url
{
my $href= shift;
if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#)
{
my ($document_path, $eprint_id, $seq_num, $filename)= ($1, $2, $3, $4);
return
{
document_path => $document_path,
document_url => $href,
eprint_id => $eprint_id,
seq_num => $seq_num,
filename => $filename,
is_downloadable => 0,
};
}
undef;
}
sub process_index_file
{
my $fnm= shift;
my $tree= HTML::TreeBuilder->new; # empty tree
$tree->parse_file($fnm);
# print "Hey, here's a dump of the parse tree=[$tree] of fnm=[$fnm]:\n";
# $tree->dump; # a method we inherit from HTML::Element
# print "And here it is, bizarrely rerendered as HTML:\n", $tree->as_HTML, "\n";
=begin comment
print "\n\n", '='x70,"\n";
print "ALL HTML\n";
print '='x70,"\n";
$tree->dump();
# $tree->as_HTML();
print '='x70,"\n";
=end comment
=cut
=begin comment
my @nodes= $tree->content_list();
print __LINE__, " nodes: ", scalar @nodes, "\n";
foreach my $node (@nodes)
{
print __LINE__, " node=[$node]\n";
print $node->dump();
}
=end comment
=cut
my %result=
(
filename => $fnm,
# mtimte => $mtime,
);
my $cnt_warnings= 0;
my %document_paths= ();
# 1. check meta elements
# my @meta_elements= $tree->find_by_tag_name('meta');
my @meta_elements= $tree->look_down( _tag => 'meta' );
# print __LINE__, " meta_elements: ", scalar @meta_elements, "\n";
foreach my $meta_element (@meta_elements)
{
# print $meta_element->dump();
my $name= $meta_element->attr('name');
my $content= $meta_element->attr('content');
# print __LINE__, " meta: name=[$name] content=[$content]\n";
if ($name eq 'eprints.document_url')
{
my $res= parse_document_url($content);
if (defined ($res))
{
$document_paths{$res->{document_path}}= $res;
}
else
{
push (@{$result{unknown_document_paths_meta}}, $content);
$cnt_warnings++;
}
}
elsif ($name =~ m#eprints\.(full_text_status|ispublished|metadata_visibility)#) { my $an= $1; $result{$an}= $content; }
elsif ($name eq 'DC.relation')
{
if ($content =~ m#https?://othes.univie.ac.at/(\d+)/$#)
{
$result{eprint_id}= $1;
$result{othes_url}= $content;
}
}
}
=begin comment
my @doc_cit= $tree->look_down( _tag => 'span', class => 'ep_document_citation' );
foreach my $doc_cit (@doc_cit)
{
print __LINE__, " doc_cit:\n";
print '='x70,"\n";
print $doc_cit->dump();
# print $doc_cit->as_HTML(), "\n";
}
=end comment
=cut
my @anchors= $tree->look_down( _tag => 'a' );
my %hrefs= ();
foreach my $anchor (@anchors)
{
my $href= $anchor->attr('href');
# print __LINE__, " anchor: href=[$href]\n";
$hrefs{$href}++;
# print '='x70,"\n";
# print $anchor->dump();
# print $div1->as_HTML(), "\n";
}
# print __LINE__, " hrefs: ", Dumper(\%hrefs);
foreach my $href (keys %hrefs)
{
if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#)
{
my $res= parse_document_url($href);
if (defined ($res))
{
if (exists ($document_paths{$res->{document_path}}))
{
$document_paths{$res->{document_path}}->{is_downloadable}= 1;
}
else
{
push (@{$result{unmatched_document_paths_href}}, $href);
$cnt_warnings++;
}
}
else
{
push (@{$result{unknown_document_paths_href}}, $href);
$cnt_warnings++;
}
}
elsif ($href =~ m#https?://resolver.obvsg.at/(urn:nbn:at:.*)#)
{
$result{urn}= $1;
$result{urn_resolver_url}= $href;
}
elsif ($href =~ m#https?://ubdata.univie.ac.at/(AC\d{8})#)
{
$result{ac_number}= $1;
$result{catalog_url}= $href;
}
elsif ($href =~ m#https?://doi\.org/(.*)#)
{
$result{doi}= $1;
$result{doi_resolver_url}= $href;
}
}
# Now that we're done with it, we must destroy it.
# $tree = $tree->delete; # Not required with weak references
my ($cnt_docs, $cnt_public, $cnt_restricted)= (0, 0, 0);
my @document_paths;
foreach my $dp (keys %document_paths)
{
my $x= $document_paths{$dp};
if ($x->{is_downloadable}) { $cnt_public++; } else { $cnt_restricted++; }
$cnt_docs++;
}
$result{cnt_docs}= $cnt_docs;
$result{cnt_public}= $cnt_public;
$result{cnt_restricted}= $cnt_restricted;
$result{cnt_warnings}= $cnt_warnings;
$result{document_paths}= [ values %document_paths ];
\%result;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment