* Bug #12475 fixed - csvRead() does not support double quoted fields. 91/11191/2
Simon Marchetto [Tue, 2 Apr 2013 16:58:41 +0000 (18:58 +0200)]
+ refactoring of splitLineCSV():
  - centralized code in addToken()
  -  remove useless argument meta

Change-Id: I25c508856e3b878d362eca60802bf9ccfbbd0fbc

scilab/CHANGES_5.5.X
scilab/modules/fileio/tests/unit_tests/double_quotes.csv [new file with mode: 0644]
scilab/modules/spreadsheet/src/c/csvRead.c
scilab/modules/spreadsheet/src/c/splitLine.c
scilab/modules/spreadsheet/src/c/splitLine.h
scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.csv [new file with mode: 0644]
scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.dia.ref [new file with mode: 0644]
scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.tst [new file with mode: 0644]
scilab/modules/spreadsheet/tests/unit_tests/csvTextScan.dia.ref
scilab/modules/spreadsheet/tests/unit_tests/csvTextScan.tst
scilab/modules/spreadsheet/tests/unit_tests/double_quotes.csv [new file with mode: 0644]

index 720e26c..47a8815 100644 (file)
@@ -66,3 +66,4 @@ Bug fixes
 * Bug #12443 fixed - Document behavior of mopen() in text file mode
                      on Windows.
 
+* Bug #12475 fixed - csvRead() does not support double quoted fields.
diff --git a/scilab/modules/fileio/tests/unit_tests/double_quotes.csv b/scilab/modules/fileio/tests/unit_tests/double_quotes.csv
new file mode 100644 (file)
index 0000000..5ebb8d3
--- /dev/null
@@ -0,0 +1,5 @@
+Dummy1,Dummy1
+Dummy2,"Dummy2, Dummy2"
+Dummy3,"(""Dummy3"")"
+"""Dummy4"" Dummy4","Dummy4"
+"Dummy5","Dummy5 ""Dummy5"""
index 4070163..94de7ec 100644 (file)
@@ -373,7 +373,7 @@ static int getNumbersOfColumnsInLine(const char *line, const char *separator)
     {
         int i = 0;
         int nbTokens = 0;
-        char **splittedStr = splitLineCSV(line, separator, &nbTokens, 0);
+        char **splittedStr = splitLineCSV(line, separator, &nbTokens);
         if (splittedStr)
         {
             freeArrayOfString(splittedStr, nbTokens);
@@ -419,7 +419,7 @@ static char **getStringsFromLines(const char **lines, int sizelines,
         for (i = 0; i < sizelines; i++)
         {
             int nbTokens = 0;
-            char **lineStrings = splitLineCSV(lines[i], separator, &nbTokens, 0);
+            char **lineStrings = splitLineCSV(lines[i], separator, &nbTokens);
             int j = 0;
 
             if (lineStrings == NULL)
index f0b8f92..e80060d 100644 (file)
 #include "csv_strsubst.h"
 #include "MALLOC.h"
 #include "freeArrayOfString.h"
+
+#define EMPTYFIELD "__EMPTY_FIELD_CSV__"
+#define DOUBLE_QUOTE '"'
+
+// Add the token (string) to the array of tokens,
+// and applies post processing (escape double quotes,...)
+static int addToken(char **tokens, int *tokenIdx, const char* tokenValue, int tokenLen)
+{
+    char *token = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
+
+    if (token)
+    {
+        char *token2;
+        const char *c, *c_end;
+        char *c2;
+
+        memcpy(token, tokenValue, tokenLen);
+        token[tokenLen] = 0;
+
+        if (strcmp(token, EMPTYFIELD) == 0)
+        {
+            strcpy(token, "");
+        }
+
+        // Escape double quotes, and remove simple quotes
+        token2 = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
+        c = token;
+        c_end = c + tokenLen;
+        c2 = token2;
+        while (c < c_end)
+        {
+            if (*c == DOUBLE_QUOTE)
+            {
+                c++;
+                if (*c == DOUBLE_QUOTE)
+                {
+                    *c2 = DOUBLE_QUOTE;
+                    c++;
+                    c2++;
+                }
+            }
+            else
+            {
+                *c2 = *c;
+                c++;
+                c2++;
+            }
+        }
+        *c2 = 0;
+
+        // Add token
+        tokens[*tokenIdx] = token2;
+        (*tokenIdx)++;
+
+        FREE(token);
+
+        return TRUE;
+    }
+    return FALSE;
+}
+
 /* ==================================================================== */
-char **splitLineCSV(const char *str, const char *sep, int *toks, char meta)
+char **splitLineCSV(const char *str, const char *sep, int *toks)
 {
-#define EMPTYFIELD "__EMPTY_FIELD_CSV__"
     char **retstr = NULL;
     const char *idx = NULL;
     const char *end = NULL;
@@ -26,7 +86,7 @@ char **splitLineCSV(const char *str, const char *sep, int *toks, char meta)
     const char *sep_idx = NULL;
     int len = 0;
     int curr_str = 0;
-    char last_char = 0xFF;
+    int inDoubleQuote = 0;
 
     /* Usually, it should be ,, or ;; */
     char tokenstring_to_search[64] = "";
@@ -61,20 +121,17 @@ char **splitLineCSV(const char *str, const char *sep, int *toks, char meta)
         sprintf(tmp, "%s%s", substitutedstring, EMPTYFIELD);
         FREE(substitutedstring);
         substitutedstring = tmp;
-
     }
 
     sep_end = sep + strlen(sep);
     end = substitutedstring + strlen(substitutedstring);
 
-    sep_idx = sep;
     idx = substitutedstring;
 
     if (strstr(substitutedstring, sep) == NULL)
     {
         *toks = 0;
         FREE(substitutedstring);
-        substitutedstring = NULL;
         return NULL;
     }
 
@@ -83,101 +140,101 @@ char **splitLineCSV(const char *str, const char *sep, int *toks, char meta)
     {
         *toks = 0;
         FREE(substitutedstring);
-        substitutedstring = NULL;
         return NULL;
     }
 
     while (idx < end)
     {
-        while (sep_idx < sep_end)
+        // If we are in a double quoted field, we do not plit on separators
+        if (!inDoubleQuote)
         {
-            if ((*idx == *sep_idx) && (last_char != meta))
+            sep_idx = sep;
+            while (sep_idx < sep_end)
             {
-                if (len > 0)
+                if ((*idx == *sep_idx))
                 {
-                    if (curr_str < (int)strlen(substitutedstring))
+                    if (len > 0)
                     {
-                        retstr[curr_str] = (char *) MALLOC((sizeof(char) * len) + 1);
-
-                        if (retstr[curr_str] == NULL)
+                        if (curr_str < (int)strlen(substitutedstring))
                         {
-                            *toks = 0;
-                            FREE(substitutedstring);
-                            substitutedstring = NULL;
-                            freeArrayOfString(retstr, strlen(substitutedstring));
-                            return NULL;
+                            // New token (= field)
+                            if (addToken(retstr, &curr_str, (char*)(idx - len), len))
+                            {
+                                // Reset for next field
+                                len = 0;
+                                idx++;
+                            }
+                            else
+                            {
+                                *toks = 0;
+                                FREE(substitutedstring);
+                                freeArrayOfString(retstr, strlen(substitutedstring));
+                                return NULL;
+                            }
                         }
-                        memcpy(retstr[curr_str], (idx - len), len);
-                        retstr[curr_str][len] = 0;
-                        if (strcmp(retstr[curr_str], EMPTYFIELD) == 0)
+
+                        if (curr_str >= (int)strlen(substitutedstring))
                         {
-                            strcpy(retstr[curr_str], "");
+                            *toks = curr_str + 1;
+                            FREE(substitutedstring);
+                            return retstr;
                         }
-                        len = 0;
-                        curr_str++;
-                        last_char = *idx;
-                        idx++;
                     }
-
-                    if (curr_str >= (int)strlen(substitutedstring))
+                    else
                     {
-                        *toks = curr_str + 1;
-                        FREE(substitutedstring);
-                        substitutedstring = NULL;
-                        return retstr;
+                        idx++;
+                        len = 0;
                     }
                 }
                 else
                 {
-                    last_char = *idx;
-                    idx++;
-                    sep_idx = sep;
-                    len = 0;
+                    sep_idx++;
                 }
             }
-            else
+        }
+
+        if (*idx == '"')
+        {
+            // Count number of consecutive double quotes
+            int nbDoubleQuotes = 0;
+            const char *idxTmp = idx;
+
+            while (*idxTmp == '"')
             {
-                sep_idx++;
+                *idxTmp++;
             }
-        }
+            nbDoubleQuotes = idxTmp - idx;
 
-        sep_idx = sep;
-        len++;
-        last_char = *idx;
-        idx++;
+            // if it is odd, we enter or leave a double quoted field
+            if (nbDoubleQuotes % 2 == 1)
+            {
+                inDoubleQuote = (inDoubleQuote == 0) ? 1 : 0;
+            }
+            len += nbDoubleQuotes;
+            idx += nbDoubleQuotes;
+        }
+        else
+        {
+            len++;
+            idx++;
+        }
     }
 
     if (len > 0)
     {
-        retstr[curr_str] = (char *) MALLOC((sizeof(char) * len) + 1);
-
-        if (retstr[curr_str] == NULL)
+        // New token (= field)
+        if (!addToken(retstr, &curr_str, (char*)(idx - len), len))
         {
             *toks = 0;
-            if (substitutedstring)
-            {
-                FREE(substitutedstring);
-                substitutedstring = NULL;
-            }
+            FREE(substitutedstring);
             freeArrayOfString(retstr, strlen(substitutedstring));
             return NULL;
         }
-
-        memcpy(retstr[curr_str], (idx - len), len);
-        retstr[curr_str][len] = 0;
-        if (strcmp(retstr[curr_str], EMPTYFIELD) == 0)
-        {
-            strcpy(retstr[curr_str], "");
-        }
-
-        *toks = curr_str + 1;
     }
 
-    if (substitutedstring)
-    {
-        FREE(substitutedstring);
-        substitutedstring = NULL;
-    }
+    *toks = curr_str;
+
+    FREE(substitutedstring);
 
     return retstr;
 }
index 879b541..856b859 100644 (file)
@@ -19,7 +19,7 @@ extern "C" {
     /**
     * split a line by separator
     */
-    char **splitLineCSV(const char *str, const char *sep, int *toks, char meta);
+    char **splitLineCSV(const char *str, const char *sep, int *toks);
 
 #ifdef __cplusplus
 }
diff --git a/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.csv b/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.csv
new file mode 100644 (file)
index 0000000..d412cc9
--- /dev/null
@@ -0,0 +1,2 @@
+Model,MDO4014-3
+FREQUENCY,"MATH<FFT(CH1, BLACKMANHARRIS, LOGRMS)>"
diff --git a/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.dia.ref b/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.dia.ref
new file mode 100644 (file)
index 0000000..e33211c
--- /dev/null
@@ -0,0 +1,19 @@
+// =============================================================================
+// Scilab ( http://www.scilab.org/ ) - This file is part of Scilab
+// Copyright (C) 2013 - Scilab Enterprises - Simon MARCHETTO
+//
+//  This file is distributed under the same license as the Scilab package.
+// =============================================================================
+//
+// <-- Non-regression test for bug 12475 -->
+//
+// <-- Bugzilla URL -->
+// http://bugzilla.scilab.org/show_bug.cgi?id=12475
+//
+// <-- Short Description -->
+// csvRead does not support double quotes
+csvPath = SCI + "/modules/spreadsheet/tests/nonreg_tests/bug_12475.csv";
+val = csvRead(csvPath, ",", [], "string");
+ref = ["Model", "MDO4014-3"; ..
+       "FREQUENCY", "MATH<FFT(CH1, BLACKMANHARRIS, LOGRMS)>"];
+assert_checkequal(val, ref);
diff --git a/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.tst b/scilab/modules/spreadsheet/tests/nonreg_tests/bug_12475.tst
new file mode 100644 (file)
index 0000000..64311fb
--- /dev/null
@@ -0,0 +1,23 @@
+// =============================================================================
+// Scilab ( http://www.scilab.org/ ) - This file is part of Scilab
+// Copyright (C) 2013 - Scilab Enterprises - Simon MARCHETTO
+//
+//  This file is distributed under the same license as the Scilab package.
+// =============================================================================
+//
+// <-- Non-regression test for bug 12475 -->
+//
+// <-- Bugzilla URL -->
+// http://bugzilla.scilab.org/show_bug.cgi?id=12475
+//
+// <-- Short Description -->
+// csvRead does not support double quotes
+
+csvPath = SCI + "/modules/spreadsheet/tests/nonreg_tests/bug_12475.csv";
+val = csvRead(csvPath, ",", [], "string");
+
+ref = ["Model", "MDO4014-3"; ..
+       "FREQUENCY", "MATH<FFT(CH1, BLACKMANHARRIS, LOGRMS)>"];
+
+assert_checkequal(val, ref);
+
index 1fd0943..39c1c15 100644 (file)
@@ -9,8 +9,8 @@ B = "        4;        5;     6";
 C = [A;B];
 bbSTR = csvTextScan(C, ';', [], "string");
 expected = [
-"        1" , "        2" , "     3" 
-"        4" , "        5" , "     6" 
+"        1" , "        2" , "     3"
+"        4" , "        5" , "     6"
 ];
 assert_checkequal ( bbSTR , expected );
 // =============================================================================
@@ -128,3 +128,13 @@ b = csvTextScan(r, "|", ".", "double");
 ref = [%nan , 2, %nan; %nan, 3, %nan];
 assert_checkequal ( b , ref);
 // =============================================================================
+r = mgetl(fullfile(path, "double_quotes.csv"), 5);
+ref = ['Dummy1', 'Dummy1'; ..
+       'Dummy2', 'Dummy2, Dummy2'; ..
+       'Dummy3', '(""Dummy3"")'; ..
+       '""Dummy4"" Dummy4','Dummy4'; ..
+       'Dummy5', 'Dummy5 ""Dummy5""'];
+for i=1:5
+    b = csvTextScan(r(i), ",", [], "string");
+    assert_checkequal(b , ref(i,:));
+end
index 9c2c44e..39c1c15 100644 (file)
@@ -9,8 +9,8 @@ B = "        4;        5;     6";
 C = [A;B];
 bbSTR = csvTextScan(C, ';', [], "string");
 expected = [
-"        1" , "        2" , "     3" 
-"        4" , "        5" , "     6" 
+"        1" , "        2" , "     3"
+"        4" , "        5" , "     6"
 ];
 assert_checkequal ( bbSTR , expected );
 // =============================================================================
@@ -128,4 +128,13 @@ b = csvTextScan(r, "|", ".", "double");
 ref = [%nan , 2, %nan; %nan, 3, %nan];
 assert_checkequal ( b , ref);
 // =============================================================================
-
+r = mgetl(fullfile(path, "double_quotes.csv"), 5);
+ref = ['Dummy1', 'Dummy1'; ..
+       'Dummy2', 'Dummy2, Dummy2'; ..
+       'Dummy3', '(""Dummy3"")'; ..
+       '""Dummy4"" Dummy4','Dummy4'; ..
+       'Dummy5', 'Dummy5 ""Dummy5""'];
+for i=1:5
+    b = csvTextScan(r(i), ",", [], "string");
+    assert_checkequal(b , ref(i,:));
+end
diff --git a/scilab/modules/spreadsheet/tests/unit_tests/double_quotes.csv b/scilab/modules/spreadsheet/tests/unit_tests/double_quotes.csv
new file mode 100644 (file)
index 0000000..5ebb8d3
--- /dev/null
@@ -0,0 +1,5 @@
+Dummy1,Dummy1
+Dummy2,"Dummy2, Dummy2"
+Dummy3,"(""Dummy3"")"
+"""Dummy4"" Dummy4","Dummy4"
+"Dummy5","Dummy5 ""Dummy5"""