Commit e7cc2ae3 authored by Leal Rafael
Browse files

Added separate_sentences as a class attribute

parent 60967971
......@@ -27,11 +27,14 @@ class NaiveLemmatizer:
Args:
output_type: "conllu", "lemmas" or "csv". When saving, uses the appropriate format.
clean_stopwords: boolean indicating if the stopwords must be filtered out or not.
Defaults to False
pos_tagging: boolean indicating if POS tags should be included in the lemmatization.
Only used when lemmatizing. True by default.
pos_separator: the string used to separate lemmas from pos tags. The default is &/
guess_correct_word_form: guesses if a compound word includes a dash or not
using Veikko. Only used when lemmatizing.
using Veikko. Only used when lemmatizing. True by default.
separate_sentences: if True (default), each sentence is printed or written to a
separate line. Paragraphs are always separated by an empty line.
"""
_CLIENT = docker.from_env()
......@@ -48,6 +51,7 @@ class NaiveLemmatizer:
clean_stopwords=None,
pos_tagging=None,
guess_correct_word_form=None,
separate_sentences=None,
):
self.output_type = (
output_type if output_type in ('conllu', 'lemmas', 'csv')
......@@ -59,6 +63,9 @@ class NaiveLemmatizer:
self.guess_correct_word_form = (
guess_correct_word_form if guess_correct_word_form is not None else True
)
self.separate_sentences = (
separate_sentences if separate_sentences is not None else True
)
if not isinstance(self, DatasetLemmatizer):
self._start_docker()
......@@ -161,7 +168,9 @@ class NaiveLemmatizer:
return word.strip()
def _yield_sentences(self, conllu, separate_sentences=None, pos_tagging=None):
separate_sentences = separate_sentences if separate_sentences is not None else True
separate_sentences = (
separate_sentences if separate_sentences is not None else self.separate_sentences
)
tokens = []
for lemma in self._extract_lemmas(conllu, pos_tagging=pos_tagging):
if lemma.startswith('\n'):
......@@ -317,11 +326,13 @@ class DatasetLemmatizer(NaiveLemmatizer):
combined_doc=None,
conllu_doc=None,
dest_dataset_folder=None,
separate_sentences=None,
):
super().__init__(
output_type=output_type,
pos_tagging=pos_tagging,
clean_stopwords=clean_stopwords
clean_stopwords=clean_stopwords,
separate_sentences=separate_sentences,
)
self.dataset = dataset
if isinstance(self.dataset, str):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment