Commit f2d4ab50 authored by Leal Rafael
Browse files

Moved word processing to its own function

parent 01e84539
......@@ -132,8 +132,6 @@ class NaiveLemmatizer:
yield self._ConlluLine(*conllu_line)
def _extract_lemmas(self, conllu):
guess_correct = self.guess_correct_word_form
pos_tagging = self.pos_tagging
sent_id = 0
for line in self._process_conllu(conllu):
if line == '\n\n':
......@@ -144,16 +142,21 @@ class NaiveLemmatizer:
if new_sent_id > sent_id:
sent_id = new_sent_id
yield '\n'
word = line.lemma
if self.clean_stopwords and word in self._STOPWORDS:
word = self._process_word(line.lemma, line.upos)
if not word:
if guess_correct:
word = voikko_tools.fix_compound(line.lemma)
if pos_tagging:
word = f"{word}{self.pos_separator}{line.upos}"
yield word.strip()
yield word
yield '\n'
def _process_word(self, word, pos_tag):
if self.clean_stopwords and word in self._STOPWORDS:
return None
if self.guess_correct:
word = voikko_tools.fix_compound(word)
if self.pos_tagging:
word = f"{word}{self.pos_separator}{pos_tag}"
return word.strip()
def _yield_sentences(self, conllu):
sentence = []
for lemma in self._extract_lemmas(conllu):
