Revert "Check on translation. Has issues with language detection"

waqasshabbir · waqasshabbir · commit 2f90553d12b2 · 2015-06-26T00:03:04.000+05:00
This reverts commit 68d46d5.

Conflicts:
    dateparser/conf.py
    dateparser/languages/loader.py
diff --git a/data/languages.yaml b/data/languages.yaml
@@ -1,5 +1,5 @@
 base:
-    skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", "，",]
+    skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", "，", "t"]
 
 
 en:
diff --git a/dateparser/conf.py b/dateparser/conf.py
@@ -38,7 +38,6 @@ class Settings(object):
     PREFER_DATES_FROM = 'current_period'  # past, future, current_period
     SUPPORT_BEFORE_COMMON_ERA = False
     PREFER_DAY_OF_MONTH = 'current'  # current, first, last
-    SKIP_TOKENS = ['t']
 
     def __init__(self, **kwargs):
         for key in kwargs:
diff --git a/dateparser/languages/language.py b/dateparser/languages/language.py
@@ -9,14 +9,12 @@
 
 from .dictionary import Dictionary, ALWAYS_KEEP_TOKENS
 from .validation import LanguageValidator
-from ..conf import settings
 
 
 class Language(object):
     _dictionary = None
     _splitters = None
     _wordchars = None
-    _cached = None
 
     def __init__(self, shortname, language_info):
         self.shortname = shortname
@@ -26,51 +24,40 @@ def __init__(self, shortname, language_info):
             if isinstance(value, int):
                 simplification[key] = str(value)
 
-        self._cached = self
-
     def validate_info(self, validator=None):
         if validator is None:
             validator = LanguageValidator
 
-        return validator.validate_info(language_id=self.shortname, info=self._cached.info)
+        return validator.validate_info(language_id=self.shortname, info=self.info)
 
     def is_applicable(self, date_string, strip_timezone=False):
         if strip_timezone:
             date_string, timezone = pop_tz_offset_from_string(date_string, as_offset=False)
 
-        date_string = self._cached._simplify(date_string)
-        tokens = self._cached._split(date_string, keep_formatting=False)
-        if self._cached._is_date_consists_of_digits_only(tokens):
+        date_string = self._simplify(date_string)
+        tokens = self._split(date_string, keep_formatting=False)
+        if self._is_date_consists_of_digits_only(tokens):
             return True
         else:
-            return self._cached._are_all_words_in_the_dictionary(tokens)
+            return self._are_all_words_in_the_dictionary(tokens)
 
     def translate(self, date_string, keep_formatting=False):
-        tokens = self._cached._get_new_skip_tokens(settings.SKIP_TOKENS)
-
-        if tokens:
-            self._cached.info['skip'] += tokens
-            self._cached = Language(self._language_object.shortname, self._language_object.info)
-
-        date_string = self._cached._simplify(date_string)
-        words = self._cached._split(date_string, keep_formatting)
+        date_string = self._simplify(date_string)
+        words = self._split(date_string, keep_formatting)
 
-        dictionary = self._cached._get_dictionary()
+        dictionary = self._get_dictionary()
         for i, word in enumerate(words):
             word = word.lower()
             if word in dictionary:
                 words[i] = dictionary[word] or ''
 
-        return self._cached._join(filter(bool, words), separator="" if keep_formatting else " ")
-
-    def _get_new_skip_tokens(self, tokens):
-        return [token for token in tokens if token not in self._cached.info.get('skip', [])]
+        return self._join(filter(bool, words), separator="" if keep_formatting else " ")
 
     def _simplify(self, date_string):
         date_string = date_string.lower()
-        for simplification in self._cached.info.get('simplifications', []):
+        for simplification in self.info.get('simplifications', []):
             pattern, replacement = simplification.items()[0]
-            if not self._cached.info.get('no_word_spacing', False):
+            if not self.info.get('no_word_spacing', False):
                 replacement = wrap_replacement_for_regex(replacement, pattern)
                 pattern = ur'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern
             date_string = re.sub(pattern, replacement, date_string, flags=re.IGNORECASE | re.UNICODE).lower()
@@ -84,7 +71,7 @@ def _is_date_consists_of_digits_only(self, tokens):
             return True
 
     def _are_all_words_in_the_dictionary(self, words):
-        dictionary = self._cached._get_dictionary()
+        dictionary = self._get_dictionary()
         for word in words:
             word = word.lower()
             if word.isdigit() or word in dictionary:
@@ -96,8 +83,8 @@ def _are_all_words_in_the_dictionary(self, words):
 
     def _split(self, date_string, keep_formatting):
         tokens = [date_string]
-        tokens = self._cached._split_tokens_with_regex(tokens, "(\d+)")
-        tokens = self._cached._split_tokens_by_known_words(tokens, keep_formatting)
+        tokens = self._split_tokens_with_regex(tokens, "(\d+)")
+        tokens = self._split_tokens_by_known_words(tokens, keep_formatting)
         return tokens
 
     def _split_tokens_with_regex(self, tokens, regex):
@@ -107,7 +94,7 @@ def _split_tokens_with_regex(self, tokens, regex):
         return filter(bool, chain(*tokens))
 
     def _split_tokens_by_known_words(self, tokens, keep_formatting):
-        dictionary = self._cached._get_dictionary()
+        dictionary = self._get_dictionary()
         for i, token in enumerate(tokens):
             tokens[i] = dictionary.split(token, keep_formatting)
         return list(chain(*tokens))
@@ -116,7 +103,7 @@ def _join(self, tokens, separator=" "):
         if not tokens:
             return ""
 
-        capturing_splitters = self._cached._get_splitters()['capturing']
+        capturing_splitters = self._get_splitters()['capturing']
         joined = tokens[0]
         for i in range(1, len(tokens)):
             left, right = tokens[i - 1], tokens[i]
@@ -127,19 +114,19 @@ def _join(self, tokens, separator=" "):
         return joined
 
     def _get_dictionary(self):
-        if self._cached._dictionary is None:
-            self._cached._generate_dictionary()
-        return self._cached._dictionary
+        if self._dictionary is None:
+            self._generate_dictionary()
+        return self._dictionary
 
     def _get_wordchars(self):
-        if self._cached._wordchars is None:
-            self._cached._set_wordchars()
-        return self._cached._wordchars
+        if self._wordchars is None:
+            self._set_wordchars()
+        return self._wordchars
 
     def _get_splitters(self):
-        if self._cached._splitters is None:
-            self._cached._set_splitters()
-        return self._cached._splitters
+        if self._splitters is None:
+            self._set_splitters()
+        return self._splitters
 
     def _set_splitters(self):
         splitters = {
@@ -148,28 +135,28 @@ def _set_splitters(self):
         }
         splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)
 
-        wordchars = self._cached._get_wordchars()
-        skip = set(self._cached.info.get('skip', [])) | splitters['capturing']
+        wordchars = self._get_wordchars()
+        skip = set(self.info.get('skip', [])) | splitters['capturing']
         for token in skip:
             if not re.match('^\W+$', token, re.UNICODE):
                 continue
             if token in wordchars:
                 splitters['wordchars'].add(token)
 
-        self._cached._splitters = splitters
+        self._splitters = splitters
 
     def _set_wordchars(self):
         wordchars = set()
-        for word in self._cached._get_dictionary():
+        for word in self._get_dictionary():
             if re.match('^[\W\d_]+$', word, re.UNICODE):
                 continue
             for char in word:
                 wordchars.add(char.lower())
 
-        self._cached._wordchars = wordchars - {" "} | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
+        self._wordchars = wordchars - {" "} | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
 
     def _generate_dictionary(self):
-        self._cached._dictionary = Dictionary(self.info)
+        self._dictionary = Dictionary(self.info)
 
     def to_parserinfo(self, base_cls=parser.parserinfo):
         attributes = {
diff --git a/dateparser/languages/loader.py b/dateparser/languages/loader.py
@@ -4,7 +4,6 @@
 from yaml import load as load_yaml
 
 from .language import Language
-from ..conf import settings
 
 
 class LanguageDataLoader(object):
@@ -36,9 +35,7 @@ def _load_data(self):
         else:
             data = self.file.read()
         data = load_yaml(data)
-        base_data = data.pop('base', {'skip': []})
-        # also add any skip tokens from custom settings
-        base_data['skip'] += settings.SKIP_TOKENS
+        base_data = data.pop('base', {})
         known_languages = {}
         for shortname, language_info in data.iteritems():
             self._update_language_info_with_base_info(language_info, base_data)

-Original file line number
+Diff line change
@@ -1,5 +1,5 @@
 base:
 -    skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", "，",]
 +    skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", "，", "t"]
 en: