Skip to content
Snippets Groups Projects
Commit 285c5c08 authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

script to guess the language of text files generated from othes pdfs

parent 4bcfbb0c
Branches
No related tags found
No related merge requests found
#!/usr/bin/python3
from guess_language import guess_language
from argparse import ArgumentParser, FileType
import sys
# Module metadata.
__authors__ = ["GG"]
# Presumably a date stamp encoded as a YYYYMMDD integer -- TODO confirm.
__date__ = 20191031
# NOTE(review): this description does not mention language guessing, which is
# what the code in this file does; it looks like a leftover -- confirm.
__description__ = 'A simple pdf example'
def guesslang(fnm):
    """Print the guessed language of every page of the text file *fnm*.

    The file is read as text and split on Form Feed characters; each piece
    before a Form Feed is treated as one page.  For every such page one line
    consisting of the guessed language, the 1-based page number and the file
    name is printed to stdout.

    Whatever follows the last Form Feed is expected to be empty, so it is
    only reported (as the page after the last counted one) when its guessed
    language is not 'UNKNOWN'.

    Args:
        fnm: Path of the text file to analyse.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even when read() raises.
    with open(fnm, 'rt') as fin:
        text = fin.read()

    pages = text.split("\f")
    # Note: there should not be anything behind the last Form Feed character
    lastpage = pages.pop()
    lastlang = guess_language(lastpage)

    # Stays 0 when the file contains no Form Feed at all.
    page_num = 0
    for page_num, page in enumerate(pages, start=1):
        print(guess_language(page), ' ', page_num, ' ', fnm)

    if lastlang != 'UNKNOWN':
        # page_num is the number of the last page printed above, so the
        # trailing text is page_num + 1 (the original skipped one number).
        print(lastlang, ' ', page_num + 1, ' ', fnm)
# Command-line driver: every argument is treated as a file to analyse page by
# page; without arguments the whole of stdin is classified as a single text.
input_files = sys.argv[1:]
if input_files:
    for input_file in input_files:
        guesslang(input_file)
else:
    stdin_text = sys.stdin.read()
    print(guess_language(stdin_text), ' ', 'stdin')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment