1#!/usr/bin/env python3
  2
  3"""Test strings to see if they appear to be English or French.
  4"""
  5
  6
  7
  8import re
  9import string  # (depreciated module) pylint: disable=W0402
 10import subprocess
 11
 12
 13def tokens(sentence):
 14    """Split `sentence` into tokens using the same algorithm as the aspell tokenizer.
 15    """
 16
 17    # print sentence, type(sentence)
 18    # words = re.split('\W+', sentence, re.UNICODE)
 19    regex = re.compile(r'[%s\s]+' % re.escape(string.punctuation))
 20    words = regex.split(sentence)
 21    res = []
 22    for word in words:
 23        if any(c.isdigit() for c in word):
 24            for nondigit in re.split(r'\d+', word):
 25                if len(nondigit) > 0:
 26                    res.append(nondigit)
 27
 28        elif len(word) > 0:
 29            res.append(word)
 30
 31    return res
 32
 33bonus_words = {'fr': {
 34        'depointage': 2,
 35        'estimee': 3,
 36        'coh\xe9rent': 1,
 37        'etat': 5,
 38        'dans': 3,
 39        'sous': 2,
 40        'le': 0.1,
 41        'alarme': 1,
 42        'survie': 1,
 43        'relais': 1,
 44        'coupleur': 1,
 45        'coupleurs': 1,
 46        'm\xe9moire': 2,
 47        'pour': 2,
 48        'memoire': 2,
 49        'interne': 3,
 50        'nom': 4,
 51        'electronique': 1,
 52        'pointeur': 2,
 53        'connexion': 1,
 54        'tuyere': 5,
 55        'rechauffage': 2,
 56        'capteur': 2,
 57        'destock\xe9': 2,
 58        'alim': 1,
 59        'ecart': 3,
 60        'longueur': 2,
 61        'une': 2,
 62        'terre': 2,
 63        'masquage': 2,
 64        'logiciel': 2,
 65        'chaine': 4,
 66        'courant': 1,
 67        'tuy\xe8re': 1,
 68        'tuyeres': 1,
 69        'chauf': 2,
 70        }}
 71
 72
 73def aspell(sentence, dictionary, debug=False):
 74    """Call aspell as a subprocess, passing in `sentence`, using language `dictionary`,
 75    and return a numerical value indicating the correlation between the input sentence
 76    and the requested language.
 77    """
 78
 79    words = tokens(sentence)
 80
 81    # print 'aspell ', sentence, ' lang ', dictionary, ' tokens ', words
 82
 83    # note, we have this awful hack so it runs on the AIX TCE boxes which don't have french
 84    # dictionaries installed system wide. Maybe this could be improved by installing a local
 85    # aspell with en and fr dictionaries built in
 86    # On the IPR machines this could be removed.
 87    if dictionary == 'en':
 88        child = subprocess.Popen(['aspell', '-d', dictionary, '--ignore-accents', '-a'],
 89                                 stdin=subprocess.PIPE,
 90                                 stdout=subprocess.PIPE)
 91    else:
 92        child = subprocess.Popen(['aspell', '-d', dictionary, '--ignore-accents', '-a',
 93                                  '--dict-dir',
 94                                  '/homespace/elson/AIX/share/myaspell/aspell-fr-0.50-3'],
 95                                 stdin=subprocess.PIPE,
 96                                 stdout=subprocess.PIPE)
 97
 98    out, _ = child.communicate((' '.join(words)).encode('utf8'))
 99    score = 0.0
100    if debug:
101        explain = []
102
103    word_iter = iter(words)
104
105    for line in out.split('\n'):
106        # print 'LINE x'+line+'x'
107        if line.startswith('@') or len(line) == 0:
108            continue
109
110        try:
111            word = next(word_iter)
112        except StopIteration:
113            word = ''
114
115        bonus = 0
116        if line.startswith('*'):
117            # print 'WORD ', word, ' LINE ', line, ' MATCH'
118            if len(word) == 1:
119                bonus = 0.25
120            elif len(word) == 2:
121                bonus = 0.5
122            else:
123                bonus = 1 + len(word) / 10
124
125        if dictionary in bonus_words:
126            bonus += bonus_words[dictionary].get(word.lower(), 0)
127
128        if debug:
129            explain.append((word, bonus))
130
131        score += bonus
132
133        # elif debug:
134            # print 'WORD ', word, ' LINE ', line, ' NOMATCH'
135            # pos = line.find(' ', 2)
136            # explain.append((line[2:pos], False))  # line[pos+1:].split()))
137
138    if debug:
139        return score, explain
140    else:
141        return score
142
143
def guess_lang(sentence, debug=False):
    """Decide whether `sentence` looks like English ('en') or French ('fr').

    Each candidate language is scored with `aspell` and the highest score wins; on a tie
    the language tried first ('en') is kept.

    Without `debug`, returns the language code. If aspell cannot be launched because the
    executable does not exist (OSError with errno 2), 'en' is returned as a default.

    With `debug`, returns ``(language, explains, counts)`` where `explains` and `counts`
    map each language code to the per-word explanation and the total score from `aspell`.
    In this mode an OSError from `aspell` is not caught.
    """

    candidates = ['en', 'fr']
    explains = {}
    counts = {}
    winner = None
    top_score = None

    for lang in candidates:
        if not debug:
            try:
                score = aspell(sentence, lang)
            except OSError as err:
                # errno 2: no such file or directory, i.e. aspell is not installed
                if err.errno != 2:
                    raise
                return 'en'  # default value if aspell is not found
        else:
            score, details = aspell(sentence, lang, debug)
            explains[lang] = details
            counts[lang] = score

        # Strict comparison: a later language must beat, not equal, the current best.
        if top_score is None or score > top_score:
            winner, top_score = lang, score

    if not debug:
        return winner
    return winner, explains, counts