#!/usr/bin/python3
# Provenance: extracted from a git patch exported via GitLab.
#   Commit:  285c5c08ddea02f7de6ddb2638038472150d49ea
#   Author:  Gerhard Gonter <ggonter@gmail.com>
#   Date:    Thu, 28 May 2020 16:11:45 +0200
#   Subject: script to guess the language of text files generated from
#            othes pdfs
"""Guess the language of text files generated from othes PDFs.

Each input file is split into pages at form feed characters and the
detected language of every page is printed, one line per page, as::

    <language>   <page number>   <file name>

When no file names are given on the command line, the text read from
stdin is classified as a whole and reported as ``<language>   stdin``.
"""

import sys
# ArgumentParser and FileType are not used yet; the import is kept as it
# was in the original script.
from argparse import ArgumentParser, FileType

__authors__ = ["GG"]
__date__ = 20191031
__description__ = 'A simple pdf example'


def _default_detector():
    """Return the ``guess_language`` function of the third-party package.

    The import is done lazily so that this module can be imported, and
    ``guesslang`` can be called with a custom detector, on systems where
    the ``guess_language`` package is not installed.  A missing package
    still raises ``ImportError``, just at first use instead of at import.
    """
    from guess_language import guess_language
    return guess_language


def guesslang(fnm, detect=None):
    """Print the detected language of every page of the text file *fnm*.

    Args:
        fnm: Path of the text file; pages are separated by form feeds.
        detect: Optional callable mapping a text to a language code.
            Defaults to ``guess_language.guess_language``.

    Returns:
        None.  One line per page is written to stdout.

    Raises:
        OSError: If *fnm* cannot be opened or read.
        ImportError: If *detect* is omitted and the ``guess_language``
            package is not installed.
    """
    if detect is None:
        detect = _default_detector()

    # The context manager closes the file even if read() raises.
    with open(fnm, 'rt') as fin:
        text = fin.read()

    pages = text.split("\f")

    # Note: there should not be anything behind the last Form Feed
    # character, so the final chunk is handled separately and only
    # reported when a language could actually be detected for it.
    lastpage = pages.pop()
    lastlang = detect(lastpage)

    for page_num, page in enumerate(pages, start=1):
        lang = detect(page)
        print(lang, ' ', page_num, ' ', fnm)

    if lastlang != 'UNKNOWN':
        # The trailing chunk is the page right after the numbered ones;
        # the original printed one more than that (off-by-one).
        print(lastlang, ' ', len(pages) + 1, ' ', fnm)


def main(argv=None):
    """Run the command line tool.

    Args:
        argv: File names to process; defaults to ``sys.argv[1:]``.  With
            no file names the text on stdin is classified instead.
    """
    filenames = sys.argv[1:] if argv is None else list(argv)

    if filenames:
        for fnm in filenames:
            guesslang(fnm)
    else:
        text = sys.stdin.read()
        lang = _default_detector()(text)
        print(lang, ' ', 'stdin')


# Guarded so that importing the module does not consume argv or stdin.
if __name__ == "__main__":
    main()