From a4b37a8ed0ea02dd0e6bddeba467f415998ebda7 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Thu, 2 Jul 2020 19:01:27 +0200 Subject: [PATCH] * added option --debug_filenames (incomplete) * provision to change filenames during upload (not used after all) --- eprints1.pl | 80 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/eprints1.pl b/eprints1.pl index c13555e..8ea9b7b 100755 --- a/eprints1.pl +++ b/eprints1.pl @@ -31,6 +31,7 @@ a.k.a. NBNs, as a proxy of attached applications like "eprints". --debug_classifications --DC --debug_stkz --debug_stbez + --debug_filenames =head2 databases access @@ -63,13 +64,13 @@ use JSON -convert_blessed_universally; use Util::ts; use Util::JSON; use Util::Matrix; -use Util::ts; use Redmine::DB::MySQL; use Phaidra::Utils::iso639; use Data::Dumper; $Data::Dumper::Indent= 1; +$Data::Dumper::Sortkeys= 1; use FileHandle; @@ -208,10 +209,11 @@ while (defined ($arg= shift (@ARGV))) elsif ($opt eq 'debug') { $debug_level= $val || 1; } elsif ($opt eq 'debug_names') { $op_mode= 'debug_names'; } elsif ($opt eq 'debug_classifications' || $opt eq 'DC') { $op_mode= 'debug_classifications'; } - elsif ($opt eq 'debug_keywords') { $op_mode= 'debug_keywords'; } + elsif ($opt eq 'debug_keywords') { $op_mode= 'debug_keywords'; } elsif ($opt eq 'debug_abstracts') { $op_mode= 'debug_abstracts'; } - elsif ($opt eq 'debug_stkz') { $op_mode= 'debug_stkz'; } + elsif ($opt eq 'debug_stkz') { $op_mode= 'debug_stkz'; } elsif ($opt eq 'debug_stbez') { $op_mode= 'debug_stbez'; } + elsif ($opt eq 'debug_filenames') { $op_mode= 'debug_filenames'; } elsif ($opt eq 'max') { $MAX_SYNC= $val || shift (@ARGV); } elsif ($opt eq 'mab-age') { $MAX_MAB_AGE= $val || shift (@ARGV); } # in seconds elsif ($opt eq 'marc-age'){ $MAX_MARC_AGE= $val || shift (@ARGV); } # in seconds @@ -374,6 +376,10 @@ elsif ($op_mode eq 'debug_stbez') { debug_stbez(); } +elsif ($op_mode eq
'debug_filenames') +{ + debug_filenames(); +} elsif ($op_mode eq 'reset') # reset error conditions for given ac_numbers { reset_errors(@PARS); @@ -2420,7 +2426,7 @@ sub doigen # BEGIN check language - my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(fileinfo dir)); + my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(eprintid fileinfo dir)); my $language= Phaidra::Utils::iso639::iso_639_2_to_1($row->{sprache}); my ($abstract, $abstract_eng)= map { strip_text($row->{$_}) } qw(abstract abstract_eng); @@ -2678,9 +2684,12 @@ sub ot2ut my $upload_cnf= $cnf->{$ot2ut_context}; die "no valid ot2ut context" unless (defined ($upload_cnf)); + my $main_file= $files->[0]; + my ($local_filename, $lfnm)= map { $main_file->{$_} } qw(path upl_fnm); + # TODO: use curl for now my @upload_cmd= (qw(/usr/bin/curl -X POST -v -H Content-Type:multipart/form-data -F), 'metadata=@' . $utheses_json_path, - qw(-F type=application/json -F), 'file=@' . $files->[0]->{path}, + qw(-F type=application/json -F), 'file=@' . $local_filename . ';filename=' . $lfnm, qw(-F type=application/pdf), $upload_cnf->{import_url}, '-o' . 
$utheses_upload_result_json_path); if (exists ($upload_cnf->{headers})) @@ -2763,6 +2772,7 @@ old format 2019-11..2020-01 error_code => 'ok', error_cnt => 0, utheses_id => $utheses_id, + uploaded_fnm => $lfnm, response_msg => $response_msg, }; @@ -2781,9 +2791,9 @@ old format 2019-11..2020-01 last unless ($running); } - my @columns= qw( eprint_id eprint_status lastmod context ts_upload td_total error_code error_cnt utheses_id container_pid container_result document_pid - document_result activate_result import_code - response_msg import_note ); + my @columns= qw( eprint_id eprint_status lastmod context ts_upload td_total error_code error_cnt utheses_id uploaded_fnm ); + +# Fields currently not available: container_pid container_result document_pid document_result activate_result import_code response_msg import_note my $fnm= sprintf('ot2ut_%s.tsv', ts_ISO()); Util::Matrix::save_hash_as_csv(\@columns, \@synced, $fnm, "\t", '', "\n", 1); @@ -2824,7 +2834,7 @@ sub generate_utheses_metadata my $history= get_history($epr_db, $eprintid); # print __LINE__, " history: ", Dumper($history); exit; - my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(fileinfo dir)); + my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(eprintid fileinfo dir)); print __LINE__, " lang_pdf=[$lang_pdf] files: ", Dumper($files); my $main_file; if (@$files) @@ -3016,6 +3026,10 @@ sub generate_utheses_metadata $thesis->{subject_classifications}= $classifications; $thesis->{number_of_pages}= "$main_file->{page_count}"; # Phaidra expects this as a string + # 2020-06-30: modify filename for files containing sensitive information, but keep the original filename + $thesis->{original_filename}= $main_file->{orig_fnm}; + $thesis->{upload_filename}= $main_file->{upl_fnm}; + $ut->public('thesis', $thesis); # Mon May 11 22:12:38 CEST 2020 asked nd about this, especially thesis_doc_added_date: @@ -3205,7 +3219,8 @@ sub get_thesis_data type => $map_ot2ut_thesis_type{$row->{thesis_type}}, policies 
=> { - lock_status => ($row->{sperre} eq 'FALSE') ? 0 : 1, # TRUE or NULL means the object is locked + # lock_status => ($row->{sperre} eq 'FALSE') ? 0 : 1, # TRUE or NULL means the object is locked + lock_status => ($row->{sperre} eq 'TRUE') ? 1 : 0, # TRUE means the object is locked; FALSE and NULL means not locked authorisation_to_use_by_author => ($row->{einverstaendnis} eq 'TRUE') ? 1 : 0, fulltext_locked => ($row->{full_text_status} eq 'public') ? 0 : 1, # possible values for fulltext_locked: NULL, none, resticted @@ -3227,6 +3242,10 @@ sub get_thesis_data { $policies->{lock_until_date}= get_othes_timestamp($row, 'date_sperre'); # can be NULL $policies->{lock_request}= 1; + + my $ts_now= Util::ts::ts_ISO3_gmt(time()); + print __LINE__, " ts_now=[$ts_now] lock_until_date=[$policies->{lock_until_date}]\n"; + $policies->{lock_status}= 0 if ($policies->{lock_until_date} lt $ts_now); # not locked if lock_until_date is in the past } if ($row->{abstract_nicht_anzeigen} eq 'TRUE') @@ -3325,6 +3344,7 @@ sub get_names_for_role sub analyze_files { + my $eprintid= shift; my $fileinfo= shift; my $dir= shift; @@ -3341,12 +3361,25 @@ sub analyze_files my ($icon, $filepath)= split(';', $fi); my $format= 'unknown'; - $format= 'pdf' if ($icon eq '/style/images/fileicons/pdf.png'); + $format= 'pdf' if ($icon eq '/style/images/fileicons/pdf.png' || $icon eq '/style/images/fileicons/application_pdf.png'); $filepath =~ s#%([\dA-Fa-f]{2})#chr(hex($1))#ge; # filenames are URL encoded, see 19072 for an example my @filepath= split('/', $filepath); - my $fnm= pop(@filepath); + my $upl_fnm= my $fnm= pop(@filepath); + +=begin comment + +... not needed after all. + # fix upload filename here + # check if local_filename contains matr as pattern + if ($upl_fnm =~ m#^[\d\-]+_\d{7}\.pdf$#) + { + $upl_fnm= join('_', 'othes', $eprintid, int(rand(1000000))) . 
'.pdf'; + } + +=end comment +=cut my @fnm= split(/\./, $fnm); my $ext= pop(@fnm); @@ -3374,7 +3407,7 @@ sub analyze_files $lang{$lang}++; } - push (@files, { format => $format, path => $path_pdf, page_count => scalar @pages }); + push (@files, { format => $format, path => $path_pdf, orig_fnm => $fnm, upl_fnm => $upl_fnm, page_count => scalar @pages }); } my $max= 0; @@ -3431,6 +3464,7 @@ sub debug_classifications print __LINE__, " cl: ", Dumper($cl); } + sub debug_names { my $epr= get_eprints_db($cnf); @@ -3629,6 +3663,26 @@ sub debug_stkz } } +sub debug_filenames +{ + my $epr= get_eprints_db($cnf); + # print "epr: ", Dumper ($epr); + + my $epr_db= $epr->connect(); + # print "epr_db: ", Dumper ($epr_db); + + my @col_names= qw( eprintid fileinfo ); + $epr_db->show_query(1); + my $search_term= "eprint_status IN ('archive', 'buffer')"; + my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names)); + + foreach my $key (keys %$keys) + { + my $r= $keys->{$key}; + print __LINE__, " key=[$key] ", Dumper($r); + } +} + sub cleanup_keywords { my $s= shift; -- GitLab