From 285c5c08ddea02f7de6ddb2638038472150d49ea Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Thu, 28 May 2020 16:11:45 +0200
Subject: [PATCH] script to guess the language of text files generated from
 othes pdfs

---
 guesslang.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 guesslang.py

diff --git a/guesslang.py b/guesslang.py
new file mode 100755
index 0000000..6b8f571
--- /dev/null
+++ b/guesslang.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+from guess_language import guess_language
+from argparse import ArgumentParser, FileType
+import sys
+
+__authors__ = ["GG"]
+__date__ = 20191031
+__description__ = 'A simple pdf example'
+
+def guesslang(fnm):
+  """Print the guessed language of every form-feed separated page of a file.
+
+  fnm is the path of a text file; one line with language, page number and
+  file name is printed per page.
+  """
+  with open(fnm, 'rt') as fin:
+    text= fin.read()
+
+  pages= text.split("\f")
+
+  # Note: there should not be anything behind the last Form Feed character,
+  # so the trailing chunk is only reported when a language was recognised.
+  lastpage= pages.pop()
+  lastlang= guess_language(lastpage)
+
+  page_num= 0
+  for page_num, page in enumerate(pages, 1):
+    print(guess_language(page), ' ', page_num, ' ', fnm)
+  # page_num is the last numbered page (0 if none); the trailing chunk is next
+  if (lastlang != 'UNKNOWN'):
+    print(lastlang, ' ', page_num+1, ' ', fnm)
+
+if __name__ == '__main__':
+  # File names given: report per page; otherwise classify all of stdin.
+  if len(sys.argv) > 1:
+    for fnm in sys.argv[1:]:
+      guesslang(fnm)
+  else:
+    text= sys.stdin.read()
+    lang= guess_language(text)
+    print(lang, ' ', 'stdin')
+
-- 
GitLab