Skip to content

Commit 2f90553

Browse files
committed
Revert "Check on translation. Has issues with language detection"
This reverts commit 68d46d5. Conflicts: dateparser/conf.py, dateparser/languages/loader.py
1 parent 3e5ce0e commit 2f90553

File tree

4 files changed: 33 additions (+33) and 50 deletions (-50).

data/languages.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
base:
2-
skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", ",",]
2+
skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", ",", "t"]
33

44

55
en:

dateparser/conf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ class Settings(object):
3838
PREFER_DATES_FROM = 'current_period' # past, future, current_period
3939
SUPPORT_BEFORE_COMMON_ERA = False
4040
PREFER_DAY_OF_MONTH = 'current' # current, first, last
41-
SKIP_TOKENS = ['t']
4241

4342
def __init__(self, **kwargs):
4443
for key in kwargs:

dateparser/languages/language.py

Lines changed: 31 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,12 @@
99

1010
from .dictionary import Dictionary, ALWAYS_KEEP_TOKENS
1111
from .validation import LanguageValidator
12-
from ..conf import settings
1312

1413

1514
class Language(object):
1615
_dictionary = None
1716
_splitters = None
1817
_wordchars = None
19-
_cached = None
2018

2119
def __init__(self, shortname, language_info):
2220
self.shortname = shortname
@@ -26,51 +24,40 @@ def __init__(self, shortname, language_info):
2624
if isinstance(value, int):
2725
simplification[key] = str(value)
2826

29-
self._cached = self
30-
3127
def validate_info(self, validator=None):
3228
if validator is None:
3329
validator = LanguageValidator
3430

35-
return validator.validate_info(language_id=self.shortname, info=self._cached.info)
31+
return validator.validate_info(language_id=self.shortname, info=self.info)
3632

3733
def is_applicable(self, date_string, strip_timezone=False):
3834
if strip_timezone:
3935
date_string, timezone = pop_tz_offset_from_string(date_string, as_offset=False)
4036

41-
date_string = self._cached._simplify(date_string)
42-
tokens = self._cached._split(date_string, keep_formatting=False)
43-
if self._cached._is_date_consists_of_digits_only(tokens):
37+
date_string = self._simplify(date_string)
38+
tokens = self._split(date_string, keep_formatting=False)
39+
if self._is_date_consists_of_digits_only(tokens):
4440
return True
4541
else:
46-
return self._cached._are_all_words_in_the_dictionary(tokens)
42+
return self._are_all_words_in_the_dictionary(tokens)
4743

4844
def translate(self, date_string, keep_formatting=False):
49-
tokens = self._cached._get_new_skip_tokens(settings.SKIP_TOKENS)
50-
51-
if tokens:
52-
self._cached.info['skip'] += tokens
53-
self._cached = Language(self._language_object.shortname, self._language_object.info)
54-
55-
date_string = self._cached._simplify(date_string)
56-
words = self._cached._split(date_string, keep_formatting)
45+
date_string = self._simplify(date_string)
46+
words = self._split(date_string, keep_formatting)
5747

58-
dictionary = self._cached._get_dictionary()
48+
dictionary = self._get_dictionary()
5949
for i, word in enumerate(words):
6050
word = word.lower()
6151
if word in dictionary:
6252
words[i] = dictionary[word] or ''
6353

64-
return self._cached._join(filter(bool, words), separator="" if keep_formatting else " ")
65-
66-
def _get_new_skip_tokens(self, tokens):
67-
return [token for token in tokens if token not in self._cached.info.get('skip', [])]
54+
return self._join(filter(bool, words), separator="" if keep_formatting else " ")
6855

6956
def _simplify(self, date_string):
7057
date_string = date_string.lower()
71-
for simplification in self._cached.info.get('simplifications', []):
58+
for simplification in self.info.get('simplifications', []):
7259
pattern, replacement = simplification.items()[0]
73-
if not self._cached.info.get('no_word_spacing', False):
60+
if not self.info.get('no_word_spacing', False):
7461
replacement = wrap_replacement_for_regex(replacement, pattern)
7562
pattern = ur'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern
7663
date_string = re.sub(pattern, replacement, date_string, flags=re.IGNORECASE | re.UNICODE).lower()
@@ -84,7 +71,7 @@ def _is_date_consists_of_digits_only(self, tokens):
8471
return True
8572

8673
def _are_all_words_in_the_dictionary(self, words):
87-
dictionary = self._cached._get_dictionary()
74+
dictionary = self._get_dictionary()
8875
for word in words:
8976
word = word.lower()
9077
if word.isdigit() or word in dictionary:
@@ -96,8 +83,8 @@ def _are_all_words_in_the_dictionary(self, words):
9683

9784
def _split(self, date_string, keep_formatting):
9885
tokens = [date_string]
99-
tokens = self._cached._split_tokens_with_regex(tokens, "(\d+)")
100-
tokens = self._cached._split_tokens_by_known_words(tokens, keep_formatting)
86+
tokens = self._split_tokens_with_regex(tokens, "(\d+)")
87+
tokens = self._split_tokens_by_known_words(tokens, keep_formatting)
10188
return tokens
10289

10390
def _split_tokens_with_regex(self, tokens, regex):
@@ -107,7 +94,7 @@ def _split_tokens_with_regex(self, tokens, regex):
10794
return filter(bool, chain(*tokens))
10895

10996
def _split_tokens_by_known_words(self, tokens, keep_formatting):
110-
dictionary = self._cached._get_dictionary()
97+
dictionary = self._get_dictionary()
11198
for i, token in enumerate(tokens):
11299
tokens[i] = dictionary.split(token, keep_formatting)
113100
return list(chain(*tokens))
@@ -116,7 +103,7 @@ def _join(self, tokens, separator=" "):
116103
if not tokens:
117104
return ""
118105

119-
capturing_splitters = self._cached._get_splitters()['capturing']
106+
capturing_splitters = self._get_splitters()['capturing']
120107
joined = tokens[0]
121108
for i in range(1, len(tokens)):
122109
left, right = tokens[i - 1], tokens[i]
@@ -127,19 +114,19 @@ def _join(self, tokens, separator=" "):
127114
return joined
128115

129116
def _get_dictionary(self):
130-
if self._cached._dictionary is None:
131-
self._cached._generate_dictionary()
132-
return self._cached._dictionary
117+
if self._dictionary is None:
118+
self._generate_dictionary()
119+
return self._dictionary
133120

134121
def _get_wordchars(self):
135-
if self._cached._wordchars is None:
136-
self._cached._set_wordchars()
137-
return self._cached._wordchars
122+
if self._wordchars is None:
123+
self._set_wordchars()
124+
return self._wordchars
138125

139126
def _get_splitters(self):
140-
if self._cached._splitters is None:
141-
self._cached._set_splitters()
142-
return self._cached._splitters
127+
if self._splitters is None:
128+
self._set_splitters()
129+
return self._splitters
143130

144131
def _set_splitters(self):
145132
splitters = {
@@ -148,28 +135,28 @@ def _set_splitters(self):
148135
}
149136
splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)
150137

151-
wordchars = self._cached._get_wordchars()
152-
skip = set(self._cached.info.get('skip', [])) | splitters['capturing']
138+
wordchars = self._get_wordchars()
139+
skip = set(self.info.get('skip', [])) | splitters['capturing']
153140
for token in skip:
154141
if not re.match('^\W+$', token, re.UNICODE):
155142
continue
156143
if token in wordchars:
157144
splitters['wordchars'].add(token)
158145

159-
self._cached._splitters = splitters
146+
self._splitters = splitters
160147

161148
def _set_wordchars(self):
162149
wordchars = set()
163-
for word in self._cached._get_dictionary():
150+
for word in self._get_dictionary():
164151
if re.match('^[\W\d_]+$', word, re.UNICODE):
165152
continue
166153
for char in word:
167154
wordchars.add(char.lower())
168155

169-
self._cached._wordchars = wordchars - {" "} | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
156+
self._wordchars = wordchars - {" "} | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
170157

171158
def _generate_dictionary(self):
172-
self._cached._dictionary = Dictionary(self.info)
159+
self._dictionary = Dictionary(self.info)
173160

174161
def to_parserinfo(self, base_cls=parser.parserinfo):
175162
attributes = {

dateparser/languages/loader.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from yaml import load as load_yaml
55

66
from .language import Language
7-
from ..conf import settings
87

98

109
class LanguageDataLoader(object):
@@ -36,9 +35,7 @@ def _load_data(self):
3635
else:
3736
data = self.file.read()
3837
data = load_yaml(data)
39-
base_data = data.pop('base', {'skip': []})
40-
# also add any skip tokens from custom settings
41-
base_data['skip'] += settings.SKIP_TOKENS
38+
base_data = data.pop('base', {})
4239
known_languages = {}
4340
for shortname, language_info in data.iteritems():
4441
self._update_language_info_with_base_info(language_info, base_data)

0 commit comments

Comments
 (0)