1#!/usr/bin/env python3
2
3"""Test strings to see if they appear to be English or French.
4"""
5
6
7
8import re
9import string # (depreciated module) pylint: disable=W0402
10import subprocess
11
12
def tokens(sentence):
    """Break `sentence` into a list of words, in order of appearance.

    Intended to mirror the way the aspell tokenizer splits its input:
    runs of `string.punctuation` characters and whitespace separate words,
    and runs of digits are removed, cutting any word that contains them
    into its non-digit parts.  Empty fragments are discarded.
    """
    separators = re.compile(r'[%s\s]+' % re.escape(string.punctuation))
    digit_run = re.compile(r'\d+')

    # Splitting a chunk that has no digits yields the chunk itself, so one
    # uniform pass over the digit splitter covers both kinds of word.
    return [
        piece
        for chunk in separators.split(sentence)
        for piece in digit_run.split(chunk)
        if piece
    ]
32
# Per-language score bonuses for selected words.
# The outer key is the aspell dictionary name passed to `aspell()`; the inner
# mapping is looked up with the lower-cased word, and its value is added to the
# score of that word when aspell reports it as correctly spelled.  Languages
# without an entry here (currently 'en') get no extra bonus.
bonus_words = {'fr': {
    'depointage': 2,
    'estimee': 3,
    'coh\xe9rent': 1,
    'etat': 5,
    'dans': 3,
    'sous': 2,
    'le': 0.1,
    'alarme': 1,
    'survie': 1,
    'relais': 1,
    'coupleur': 1,
    'coupleurs': 1,
    'm\xe9moire': 2,
    'pour': 2,
    'memoire': 2,
    'interne': 3,
    'nom': 4,
    'electronique': 1,
    'pointeur': 2,
    'connexion': 1,
    'tuyere': 5,
    'rechauffage': 2,
    'capteur': 2,
    'destock\xe9': 2,
    'alim': 1,
    'ecart': 3,
    'longueur': 2,
    'une': 2,
    'terre': 2,
    'masquage': 2,
    'logiciel': 2,
    'chaine': 4,
    'courant': 1,
    'tuy\xe8re': 1,
    'tuyeres': 1,
    'chauf': 2,
    }}
71
72
def aspell(sentence, dictionary, debug=False):
    """Score how well `sentence` matches the aspell dictionary `dictionary`.

    The sentence is split with `tokens`, the words are piped to an
    ``aspell -a`` subprocess, and each word that aspell reports as correctly
    spelled (a result line starting with ``*``) contributes to the score:

    * 0.25 for a one-letter word, 0.5 for a two-letter word, and
      ``1 + len(word) / 10`` otherwise;
    * plus the word's entry in ``bonus_words[dictionary]``, if there is one
      (looked up with the lower-cased word).

    Returns the total score as a float.  When `debug` is true, returns a
    tuple ``(score, explain)`` instead, where `explain` is a list of
    ``(word, bonus)`` pairs for the recognized words.

    Raises OSError if the aspell executable cannot be started.
    """

    words = tokens(sentence)

    command = ['aspell', '-d', dictionary, '--ignore-accents', '-a']
    if dictionary != 'en':
        # HACK: the AIX TCE boxes don't have French dictionaries installed
        # system wide, so non-English lookups point at a private dictionary
        # directory.  Installing a local aspell with the en and fr
        # dictionaries built in would make this unnecessary; on the IPR
        # machines it could be removed.
        command += ['--dict-dir',
                    '/homespace/elson/AIX/share/myaspell/aspell-fr-0.50-3']

    child = subprocess.Popen(command,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    out, _ = child.communicate(' '.join(words).encode('utf8'))

    # The pipes are opened in binary mode, so `out` is bytes; decode it before
    # doing any str-based parsing.  Only the first character of each line is
    # inspected, so undecodable bytes are replaced rather than treated as fatal.
    text = out.decode('utf8', 'replace')

    extra = bonus_words.get(dictionary, {})
    score = 0.0
    explain = []
    word_iter = iter(words)

    for line in text.split('\n'):
        # Skip the '@' version banner and the blank line that ends a result set.
        if not line or line.startswith('@'):
            continue

        # Every remaining line is taken to be the result for the next input
        # word; fall back to '' if aspell produced more results than words.
        word = next(word_iter, '')

        if not line.startswith('*'):
            # Not recognized by this dictionary: contributes nothing.
            continue

        if len(word) == 1:
            bonus = 0.25
        elif len(word) == 2:
            bonus = 0.5
        else:
            bonus = 1 + len(word) / 10

        bonus += extra.get(word.lower(), 0)

        if debug:
            explain.append((word, bonus))

        score += bonus

    if debug:
        return score, explain
    return score
142
143
def guess_lang(sentence, debug=False):
    """Decide whether `sentence` looks more like English or French.

    Each candidate language is scored with `aspell` and the code of the
    highest-scoring one ('en' or 'fr') is returned; on a tie the language
    tried first ('en') wins.

    When `debug` is true the return value is a tuple
    ``(best_name, explains, counts)``, where the two dicts map each language
    code to the per-word explanation and the total score from `aspell`.

    When `debug` is false and aspell cannot be launched because the
    executable does not exist, 'en' is returned as a default.  Any other
    OSError is propagated.
    """

    winner = None
    top_score = None
    details = {}
    totals = {}

    for lang in ('en', 'fr'):
        if debug:
            total, detail = aspell(sentence, lang, debug)
            details[lang] = detail
            totals[lang] = total
        else:
            try:
                total = aspell(sentence, lang)
            except OSError as exc:
                # errno 2 is ENOENT (no such file or directory), i.e. aspell
                # is not installed: fall back to the default language.
                if exc.errno != 2:
                    raise
                return 'en'

        # Strict comparison, so an earlier language keeps the lead on a tie.
        if top_score is None or total > top_score:
            winner, top_score = lang, total

    if not debug:
        return winner
    return winner, details, totals