Merge pull request #82 from jb-kerboeuf/master

chatscriptnlp · web-flow · commit 89dba7e10f8f · 2017-03-12T15:10:26.000-07:00
misc french changes
diff --git a/SRC/spellcheck.cpp b/SRC/spellcheck.cpp
@@ -20,6 +20,66 @@ static SUFFIX stems[] =
 	{0},
 };
 
+static SUFFIX stems_french[] = // French suffix table: each entry is {suffix text, part-of-speech flags}. StemSpell walks it top to bottom and stops at the first suffix whose stripped stem SpellFix can repair, so entry order is significant.
+{
+	{ (char*)"âtre",ADJECTIVE},
+	{ (char*)"able",ADJECTIVE},
+	{ (char*)"ade",NOUN},
+	{ (char*)"age",NOUN},
+	{ (char*)"aille",NOUN},
+	{ (char*)"ain",NOUN|ADJECTIVE},
+	{ (char*)"ais",NOUN|ADJECTIVE},
+	{ (char*)"al",ADJECTIVE},
+	{ (char*)"ance",NOUN},
+	{ (char*)"ant",ADJECTIVE},
+	{ (char*)"ard",ADJECTIVE},
+	{ (char*)"aud",ADJECTIVE},
+	{ (char*)"ère",NOUN},
+	{ (char*)"ée",NOUN},
+	{ (char*)"el",ADJECTIVE},
+	{ (char*)"et",ADJECTIVE},
+	{ (char*)"esse",NOUN},
+	{ (char*)"eur",ADJECTIVE|NOUN},
+	{ (char*)"euse",NOUN},
+	{ (char*)"eux",ADJECTIVE},
+	{ (char*)"ible",ADJECTIVE},
+	{ (char*)"isme",NOUN},
+	{ (char*)"iste",NOUN|ADJECTIVE},
+	{ (char*)"ien",NOUN|ADJECTIVE},
+	{ (char*)"ier",NOUN}, // NOTE(review): listed before "ifier", which also ends in "ier" -- confirm the shorter suffix should be tried first
+	{ (char*)"ie",NOUN}, // NOTE(review): listed before "logie", "manie" and "phobie", which also end in "ie" -- confirm the intended priority
+	{ (char*)"if",ADJECTIVE},
+	{ (char*)"in",ADJECTIVE},
+	{ (char*)"ir",VERB},
+	{ (char*)"asser",VERB},
+	{ (char*)"ater",VERB},
+	{ (char*)"ailler",VERB},
+	{ (char*)"ifier",VERB},
+	{ (char*)"iner",VERB},
+	{ (char*)"iser",VERB},
+	{ (char*)"oter",VERB},
+	{ (char*)"ot",ADJECTIVE},
+	{ (char*)"oyer",VERB},
+	{ (char*)"er",VERB|NOUN}, // generic "er" comes after the longer "-er" verb endings above, so those are tried first
+	{ (char*)"ment",ADVERB|NOUN},
+	{ (char*)"ois",NOUN},
+	{ (char*)"son",NOUN},
+	{ (char*)"tion",NOUN},
+	{ (char*)"ure",NOUN},
+	{ (char*)"logue",NOUN},
+	{ (char*)"logie",NOUN},
+	{ (char*)"gène",NOUN},
+	{ (char*)"gramme",NOUN},
+	{ (char*)"manie",NOUN},
+	{ (char*)"phobe",NOUN},
+	{ (char*)"phobie",NOUN},
+	{ (char*)"ose",NOUN},
+	{0}, // sentinel: a null suffix pointer ends the scan loop in StemSpell
+};
+
+
+
+
 bool multichoice = false;
 
 void InitSpellCheck()
@@ -70,7 +130,8 @@ static int SplitWord(char* word)
 	size_t len = strlen(word);
     for (unsigned int k = 1; k < len-1; ++k)
     {
-        if (k == 1 &&*word != 'a' &&*word != 'A' &&*word != 'i' &&*word != 'I') continue; //   only a and i are allowed single-letter words
+        if (!stricmp(language,"english") && k == 1 &&*word != 'a' &&*word != 'A' &&*word != 'i' &&*word != 'I') continue; //   only a and i are allowed single-letter words
+        else if (!stricmp(language,"french") && k == 1 &&*word != 'y' &&*word != 'a' &&*word != 'A' &&*word != 'à' &&*word != 'À' &&*word != 'ô' &&*word != 'Ô') continue; //   in french only y, a and ô are allowed single-letter words
 		WORDP D1 = FindWord(word,k,PRIMARY_CASE_ALLOWED);
         if (!D1) continue;
 		good = (D1->properties & (PART_OF_SPEECH|FOREIGN_WORD)) != 0 || (D1->internalBits & HAS_SUBSTITUTE) != 0; 
@@ -829,6 +890,94 @@ static int EditDistance(WORDINFO& dictWordData, WORDINFO& realWordData,int min)
                 continue;
             }
         }
+        // french common bad spellings
+        if (!stricmp(language, "french"))
+        {
+            if (*currentCharReal == 'a' && *currentCharDict == 'â')
+            {
+            	val += 1;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'e' && *currentCharDict == 'ê')
+            {
+            	val += 10;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'è' && *currentCharDict == 'ê')
+            {
+            	val += 5;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'i' && *currentCharDict == 'î')
+            {
+            	val += 1;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'o' && *currentCharDict == 'ô')
+            {
+            	val += 1;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'u' && *currentCharDict == 'û')
+            {
+            	val += 5;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l')
+            {
+            	val += 10;
+                dictinfo = resumeDict1;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u')
+            {
+            	val += 10;
+                dictinfo = resumeDict1;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'f' && *currentCharDict == 'p' && *nextCharDict == 'h')
+            {
+            	val += 5;
+                dictinfo = resumeDict1;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 's' && *currentCharDict == 'c')
+            {
+            	val += 10;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 's' && *currentCharDict == 'ç')
+            {
+            	val += 5;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+            if (*currentCharReal == 'c' && *currentCharDict == 'ç')
+            {
+            	val += 5;
+                dictinfo = resumeDict;
+                realinfo = resumeReal;
+                continue;
+            }
+        }
         // probable transposition since swapping syncs up
         if (!strcmp(currentCharReal, nextCharDict) && !strcmp(nextCharReal, currentCharDict))
         {
@@ -927,19 +1076,41 @@ static char* StemSpell(char* word,unsigned int i,uint64& base)
 	{
 		unsigned int i = 0;
 		char* suffix;
-		while ((suffix = stems[i].word))
+		if (!stricmp(language, "english")) 
 		{
-			uint64 kind = stems[i++].flags;
-			size_t suffixlen = strlen(suffix);
-			if (!strnicmp(word+len-suffixlen,suffix,suffixlen))
+			while ((suffix = stems[i].word))
 			{
-				word1[len-suffixlen] = 0;
-				best = SpellFix(word1,0,kind); 
-				if (best) 
+				uint64 kind = stems[i++].flags;
+				size_t suffixlen = strlen(suffix);
+				if (!strnicmp(word+len-suffixlen,suffix,suffixlen))
 				{
-					ending = suffix;
-                    base = stems[i].flags;
-					break;
+					word1[len-suffixlen] = 0;
+					best = SpellFix(word1,0,kind); 
+					if (best) 
+					{
+						ending = suffix;
+	                    base = stems[i].flags;
+						break;
+					}
+				}
+			}
+		}
+		else if (!stricmp(language, "french")) 
+		{
+			while ((suffix = stems_french[i].word))
+			{
+				uint64 kind = stems_french[i++].flags;
+				size_t suffixlen = strlen(suffix);
+				if (!strnicmp(word+len-suffixlen,suffix,suffixlen))
+				{
+					word1[len-suffixlen] = 0;
+					best = SpellFix(word1,0,kind); 
+					if (best) 
+					{
+						ending = suffix;
+	                    base = stems_french[i].flags;
+						break;
+					}
 				}
 			}
 		}
diff --git a/SRC/textUtilities.cpp b/SRC/textUtilities.cpp
@@ -60,7 +60,42 @@ NUMBERDECODE numberValues[] = {
  { (char*)"gross",144,5,0},
  { (char*)"thousand",1000,8,REALNUMBER},
  { (char*)"million",1000000,7,REALNUMBER},
- { (char*)"billion",1000000,7,REALNUMBER},
+ { (char*)"billion",1000000000,7,REALNUMBER},
+};
+
+NUMBERDECODE numberValuesFrench[] = { // French number words: each entry is {word, numeric value, length of word, optional kind flag}. Convert2Integer only accepts an entry when the input length equals the stored length, so a wrong length makes the entry unreachable.
+ { (char*)"zéro",0,4,REALNUMBER}, { (char*)"zero",0,4,REALNUMBER},
+ { (char*)"un",1,2,REALNUMBER}, { (char*)"une",1,3}, { (char*)"premier",1,7}, { (char*)"première",1,8}, { (char*)"mono",1,4,0}, { (char*)"uni",1,3,0}, { (char*)"one",1,3,0},
+ { (char*)"deux",2,4,REALNUMBER}, { (char*)"second",2,6}, { (char*)"seconde",2,7}, { (char*)"deuxième",2,8}, { (char*)"double",2,6,0}, { (char*)"bi",2,2,0}, { (char*)"paire",2,5,0}, { (char*)"moitié",2,6,FRACTION_NUMBER}, { (char*)"demi",2,4,FRACTION_NUMBER}, { (char*)"two",2,3,0},
+ { (char*)"trois",3,5,REALNUMBER}, { (char*)"troisième",3,9}, { (char*)"triple",3,6,0}, { (char*)"tiers",3,5,FRACTION_NUMBER}, { (char*)"three",3,5,0}, { (char*)"III",3,3,0},
+ { (char*)"quatre",4,6,REALNUMBER}, { (char*)"quatrième",4,9}, { (char*)"quart",4,5,FRACTION_NUMBER}, { (char*)"four",4,4,0}, { (char*)"IV",4,2,0},
+ { (char*)"cinq",5,4,REALNUMBER}, { (char*)"cinquième",5,9}, { (char*)"five",5,4,0}, { (char*)"V",5,1,0},
+ { (char*)"six",6,3,REALNUMBER}, { (char*)"sixième",6,7}, { (char*)"VI",6,2,0},
+ { (char*)"sept",7,4,REALNUMBER}, { (char*)"septième",7,8}, { (char*)"seven",7,5,0}, { (char*)"VII",7,3,0},
+ { (char*)"huit",8,4,REALNUMBER}, { (char*)"huitième",8,8}, { (char*)"eight",8,5,0}, { (char*)"VIII",8,4,0},
+ { (char*)"neuf",9,4,REALNUMBER}, { (char*)"neuvième",9,8}, { (char*)"nine",9,4,0}, { (char*)"IX",9,2,0},
+ { (char*)"dix",10,3,REALNUMBER}, { (char*)"dixième",10,7}, { (char*)"dizaine",10,7,0}, { (char*)"ten",10,3,0}, { (char*)"X",10,1,0},
+ { (char*)"onze",11,4,REALNUMBER}, { (char*)"onzième",11,7}, { (char*)"XI",11,2,0},
+ { (char*)"douze",12,5,REALNUMBER}, { (char*)"douzième",12,8}, { (char*)"douzaine",12,8,0}, { (char*)"XII",12,3,0},
+ { (char*)"treize",13,6,REALNUMBER}, { (char*)"treizième",13,9}, { (char*)"XIII",13,4,0},
+ { (char*)"quatorze",14,8,REALNUMBER}, { (char*)"quatorzième",14,11}, { (char*)"XIV",14,3,0},
+ { (char*)"quinze",15,6,REALNUMBER}, { (char*)"quinzième",15,9}, { (char*)"XV",15,2,0},
+ { (char*)"seize",16,5,REALNUMBER}, { (char*)"seizième",16,8}, { (char*)"XVI",16,3,0},
+ { (char*)"dix-sept",17,8,REALNUMBER}, { (char*)"dix-septième",17,12}, { (char*)"XVII",17,4,0},
+ { (char*)"dix-huit",18,8,REALNUMBER}, { (char*)"dix-huitième",18,12}, { (char*)"XVIII",18,5,0},
+ { (char*)"dix-neuf",19,8,REALNUMBER}, { (char*)"dix-neuvième",19,12}, { (char*)"XIX",19,3,0},
+ { (char*)"vingt",20,5,REALNUMBER}, { (char*)"vingtième",20,9}, { (char*)"XX",20,2,0},
+ { (char*)"trente",30,6,REALNUMBER}, { (char*)"trentième",30,9},
+ { (char*)"quarante",40,8,REALNUMBER}, { (char*)"quarantième",40,11},
+ { (char*)"cinquante",50,9,REALNUMBER}, { (char*)"cinquantième",50,12},
+ { (char*)"soixante",60,8,REALNUMBER}, { (char*)"soixantième",60,11},
+ { (char*)"soixante-dix",70,12,REALNUMBER}, { (char*)"septante",70,8,REALNUMBER}, { (char*)"soixante-dixième",70,17}, // NOTE(review): 17 is the UTF-8 byte length; every other accented entry stores the character count (which would be 16 here) -- confirm which unit the caller's len uses and make the table uniform
+ { (char*)"quatre-vingt",80,12,REALNUMBER}, { (char*)"octante",80,7,REALNUMBER}, { (char*)"quatre-vingtième",80,16},
+ { (char*)"quatre-vingt-dix",90,16,REALNUMBER}, { (char*)"nonante",90,7,REALNUMBER}, { (char*)"quatre-vingt-dixième",90,20},
+ { (char*)"cent",100,4,REALNUMBER}, { (char*)"cents",100,5,REALNUMBER}, { (char*)"centième",100,8}, { (char*)"centaine",100,8,0},
+ { (char*)"mille",1000,5,REALNUMBER}, { (char*)"millième",1000,8}, { (char*)"millier",1000,7,0},
+ { (char*)"million",1000000,7,REALNUMBER}, { (char*)"millions",1000000,8,REALNUMBER}, { (char*)"millionième",1000000,11},
+ { (char*)"milliard",1000000000,8,REALNUMBER}, { (char*)"milliards",1000000000,9,REALNUMBER}, { (char*)"milliardième",1000000000,12},
 };
 
 char toHex[16] = {
@@ -2445,13 +2480,20 @@ int64 Convert2Integer(char* number)  //  non numbers return NOT_A_NUMBER
 	if (hyp) *hyp = '-';
 
 	// look up direct word numbers
-	if (!hasDigit) for (unsigned int i = 0; i < sizeof(numberValues)/sizeof(NUMBERDECODE); ++i)
+	if (!stricmp(language, "english") && !hasDigit) for (unsigned int i = 0; i < sizeof(numberValues)/sizeof(NUMBERDECODE); ++i)
     {
         if (len == numberValues[i].length && !strnicmp(word,numberValues[i].word,len)) 
 		{
 			return numberValues[i].value;  // a match (but may be a fraction number)
 		}
     }
+    else if (!stricmp(language, "french") && !hasDigit) for (unsigned int i = 0; i < sizeof(numberValuesFrench)/sizeof(NUMBERDECODE); ++i)
+    {
+        if (len == numberValuesFrench[i].length && !strnicmp(word,numberValuesFrench[i].word,len)) 
+		{
+			return numberValuesFrench[i].value;  // a match (but may be a fraction number)
+		}
+    }
 
     // try for hyphenated composite
  	char*  hyphen = strchr(word,'-'); 
diff --git a/SRC/tokenSystem.cpp b/SRC/tokenSystem.cpp
@@ -724,7 +724,7 @@ static char* FindWordEnd(char* ptr,char* priorToken,char** words,int &count,bool
 	char* place = ptr;
 	while (IsDigit(*place)) ++place;
 	if (!stricmp(place,"st") || !stricmp(place,"nd") || !stricmp(place,"rd")) return end;
-	else if (stricmp(language, "french") && (!stricmp(place, "er") || !stricmp(place, "ere") || !stricmp(place, "�re") || !stricmp(place, "nd") || !stricmp(place, "nde") || !stricmp(place, "eme") || !stricmp(place, "�me"))) return end;
+	else if (!stricmp(language, "french") && (!stricmp(place, "er") || !stricmp(place, "ere") || !stricmp(place, "�re") || !stricmp(place, "nd") || !stricmp(place, "nde") || !stricmp(place, "eme") || !stricmp(place, "�me"))) return end;
 	int len = end - ptr;
 	char next2;
 	if (*ptr == '/') return ptr+1; // split of things separated
@@ -762,9 +762,11 @@ static char* FindWordEnd(char* ptr,char* priorToken,char** words,int &count,bool
 				}
 				// ' as particle ellision 
 				if ((ptr - start) == 1 && (*start == 'd' || *start == 'c' || *start == 'j' || *start == 'l' || *start == 's' || *start == 't' || *start == 'm' || *start == 'n')) return ptr + 1;  // break off d' argent and other foreign particles
-				else if ((ptr - start) == 2 && *start == 'q' && *(start + 1) == 'u') return ptr + 1;  // break off qu'
-				else if ((ptr - start) == 6 && *start == 'l' && *(start + 1) == 'o' && *(start + 2) == 'r' && *(start + 3) == 's' && *(start + 4) == 'q' && *(start + 5) == 'u') return ptr + 1;  // break off lorsqu'
-				else if ((ptr - start) == 6 && *start == 'p' && *(start + 1) == 'u' && *(start + 2) == 'i' && *(start + 3) == 's' && *(start + 4) == 'q' && *(start + 5) == 'u') return ptr + 1;  // break off puisqu'
+				else if (!stricmp(language, "french") && (ptr - start) == 1 && (*start == 'D' || *start == 'C' || *start == 'J' || *start == 'L' || *start == 'S' || *start == 'T' || *start == 'M' || *start == 'N')) return ptr + 1;  // break off french particles in upper case
+				else if (!stricmp(language, "french") && (ptr - start) == 2 && (*start == 'q' || *start == 'Q') && *(start + 1) == 'u') return ptr + 1;  // break off qu'
+				else if (!stricmp(language, "french") && (ptr - start) == 5 && (*start == 'j' || *start == 'J') && *(start + 1) == 'u' && *(start + 2) == 's' && *(start + 3) == 'q' && *(start + 4) == 'u') return ptr + 1;  // break off jusqu'
+				else if (!stricmp(language, "french") && (ptr - start) == 6 && (*start == 'l' || *start == 'L') && *(start + 1) == 'o' && *(start + 2) == 'r' && *(start + 3) == 's' && *(start + 4) == 'q' && *(start + 5) == 'u') return ptr + 1;  // break off lorsqu'
+				else if (!stricmp(language, "french") && (ptr - start) == 6 && (*start == 'p' || *start == 'P') && *(start + 1) == 'u' && *(start + 2) == 'i' && *(start + 3) == 's' && *(start + 4) == 'q' && *(start + 5) == 'u') return ptr + 1;  // break off puisqu'
 
 				//   12'6" or 12'. or 12' 
 				if (IsDigit(*start) && !IsAlphaUTF8(next)) return ptr + 1;  //   12' swallow ' into number word