* isnum has been redesigned in native code. Up to 130x performance
[scilab.git] / scilab / modules / spreadsheet / src / c / splitLine.c
1 /*
2  * Scilab ( http://www.scilab.org/ ) - This file is part of Scilab
3  * Copyright (C) 2010-2011 - DIGITEO - Allan CORNET
4  *
5  * This file must be used under the terms of the CeCILL.
6  * This source file is licensed as described in the file COPYING, which
7  * you should have received as part of this distribution.  The terms
8  * are also available at
9  * http://www.cecill.info/licences/Licence_CeCILL_V2-en.txt
10  *
11  */
12 #include <string.h>
13 #include <stdio.h>
14 #include "splitLine.h"
15 #include "strsubst.h"
16 #include "MALLOC.h"
17 #include "freeArrayOfString.h"
18
19 #define EMPTYFIELD "__EMPTY_FIELD_CSV__"
20 #define DOUBLE_QUOTE '"'
21
22 // Add the token (string) to the array of tokens,
23 // and applies post processing (escape double quotes,...)
24 static int addToken(char **tokens, int *tokenIdx, const char* tokenValue, int tokenLen)
25 {
26     char *token = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
27
28     if (token)
29     {
30         char *token2;
31         const char *c, *c_end;
32         char *c2;
33
34         memcpy(token, tokenValue, tokenLen);
35         token[tokenLen] = 0;
36
37         if (strcmp(token, EMPTYFIELD) == 0)
38         {
39             strcpy(token, "");
40         }
41
42         // Escape double quotes, and remove simple quotes
43         token2 = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
44         c = token;
45         c_end = c + tokenLen;
46         c2 = token2;
47         while (c < c_end)
48         {
49             if (*c == DOUBLE_QUOTE)
50             {
51                 c++;
52                 if (*c == DOUBLE_QUOTE)
53                 {
54                     *c2 = DOUBLE_QUOTE;
55                     c++;
56                     c2++;
57                 }
58             }
59             else
60             {
61                 *c2 = *c;
62                 c++;
63                 c2++;
64             }
65         }
66         *c2 = 0;
67
68         // Add token
69         tokens[*tokenIdx] = token2;
70         (*tokenIdx)++;
71
72         FREE(token);
73
74         return TRUE;
75     }
76     return FALSE;
77 }
78
79 /* ==================================================================== */
80 char **splitLineCSV(const char *str, const char *sep, int *toks)
81 {
82     char **retstr = NULL;
83     const char *idx = NULL;
84     const char *end = NULL;
85     const char *sep_end = NULL;
86     const char *sep_idx = NULL;
87     int len = 0;
88     int curr_str = 0;
89     int inDoubleQuote = 0;
90
91     /* Usually, it should be ,, or ;; */
92     char tokenstring_to_search[64] = "";
93     /* Previous item will be replaced by ;__EMPTY_FIELD_CSV__; */
94     char tokenreplacement_string[64] = "";
95     char *substitutedstring = NULL;
96
97     sprintf(tokenstring_to_search, "%s%s", sep, sep);
98     sprintf(tokenreplacement_string, "%s%s%s", sep, EMPTYFIELD, sep);
99     substitutedstring = strsub(str, tokenstring_to_search, tokenreplacement_string);
100     /* in a string like foo;bar;;;, replace all the ;;, not only the first and last one */
101     while (strstr(substitutedstring, tokenstring_to_search) != NULL)
102     {
103         substitutedstring = strsub(substitutedstring, tokenstring_to_search, tokenreplacement_string);
104     }
105
106     if (strncmp(substitutedstring, sep, strlen(sep)) == 0)
107     {
108         char *tmp = NULL;
109         size_t l = strlen(substitutedstring) + strlen(EMPTYFIELD) + strlen(sep) + 1;
110         tmp = (char*)MALLOC(sizeof(char) * l);
111         sprintf(tmp, "%s%s%s", EMPTYFIELD, sep, &substitutedstring[1]);
112         FREE(substitutedstring);
113         substitutedstring = tmp;
114     }
115
116     if (substitutedstring[strlen(substitutedstring) - 1] == sep[0])
117     {
118         char *tmp = NULL;
119         size_t l = strlen(substitutedstring) + strlen(EMPTYFIELD) + 1;
120         tmp = (char*)MALLOC(sizeof(char) * l);
121         sprintf(tmp, "%s%s", substitutedstring, EMPTYFIELD);
122         FREE(substitutedstring);
123         substitutedstring = tmp;
124     }
125
126     sep_end = sep + strlen(sep);
127     end = substitutedstring + strlen(substitutedstring);
128
129     idx = substitutedstring;
130
131     if (strstr(substitutedstring, sep) == NULL)
132     {
133         *toks = 0;
134         FREE(substitutedstring);
135         return NULL;
136     }
137
138     retstr = (char **) MALLOC((sizeof(char *) * (int)strlen(substitutedstring)));
139     if (retstr == NULL)
140     {
141         *toks = 0;
142         FREE(substitutedstring);
143         return NULL;
144     }
145
146     while (idx < end)
147     {
148         // If we are in a double quoted field, we do not plit on separators
149         if (!inDoubleQuote)
150         {
151             sep_idx = sep;
152             while (sep_idx < sep_end)
153             {
154                 if ((*idx == *sep_idx))
155                 {
156                     if (len > 0)
157                     {
158                         if (curr_str < (int)strlen(substitutedstring))
159                         {
160                             // New token (= field)
161                             if (addToken(retstr, &curr_str, (char*)(idx - len), len))
162                             {
163                                 // Reset for next field
164                                 len = 0;
165                                 idx++;
166                             }
167                             else
168                             {
169                                 *toks = 0;
170                                 FREE(substitutedstring);
171                                 freeArrayOfString(retstr, strlen(substitutedstring));
172                                 return NULL;
173                             }
174                         }
175
176                         if (curr_str >= (int)strlen(substitutedstring))
177                         {
178                             *toks = curr_str + 1;
179                             FREE(substitutedstring);
180                             return retstr;
181                         }
182                     }
183                     else
184                     {
185                         idx++;
186                         len = 0;
187                     }
188                 }
189                 else
190                 {
191                     sep_idx++;
192                 }
193             }
194         }
195
196         if (*idx == '"')
197         {
198             // Count number of consecutive double quotes
199             int nbDoubleQuotes = 0;
200             const char *idxTmp = idx;
201
202             while (*idxTmp == '"')
203             {
204                 *idxTmp++;
205             }
206             nbDoubleQuotes = idxTmp - idx;
207
208             // if it is odd, we enter or leave a double quoted field
209             if (nbDoubleQuotes % 2 == 1)
210             {
211                 inDoubleQuote = (inDoubleQuote == 0) ? 1 : 0;
212             }
213             len += nbDoubleQuotes;
214             idx += nbDoubleQuotes;
215         }
216         else
217         {
218             len++;
219             idx++;
220         }
221     }
222
223     if (len > 0)
224     {
225         // New token (= field)
226         if (!addToken(retstr, &curr_str, (char*)(idx - len), len))
227         {
228             *toks = 0;
229             FREE(substitutedstring);
230             freeArrayOfString(retstr, strlen(substitutedstring));
231             return NULL;
232         }
233     }
234
235     *toks = curr_str;
236
237     FREE(substitutedstring);
238
239     return retstr;
240 }
241 /* ==================================================================== */