de77fb31f9dd992f5c75265414e7acb51d6939c5
[scilab.git] / scilab / modules / spreadsheet / src / c / splitLine.c
1 /*
2  * Scilab ( http://www.scilab.org/ ) - This file is part of Scilab
3  * Copyright (C) 2010-2011 - DIGITEO - Allan CORNET
4  *
5  * Copyright (C) 2012 - 2016 - Scilab Enterprises
6  *
7  * This file is hereby licensed under the terms of the GNU GPL v2.0,
8  * pursuant to article 5.3.4 of the CeCILL v.2.1.
9  * This file was originally licensed under the terms of the CeCILL v2.1,
10  * and continues to be available under such terms.
11  * For more information, see the COPYING file which you should have received
12  * along with this program.
13  *
14  */
15 #include <string.h>
16 #include <stdio.h>
17 #include "splitLine.h"
18 #include "strsubst.h"
19 #include "sci_malloc.h"
20 #include "freeArrayOfString.h"
21
22 #define EMPTYFIELD "__EMPTY_FIELD_CSV__"
23 #define DOUBLE_QUOTE '"'
24
25 // Add the token (string) to the array of tokens,
26 // and applies post processing (escape double quotes,...)
27 static int addToken(char **tokens, int *tokenIdx, const char* tokenValue, int tokenLen)
28 {
29     char *token = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
30
31     if (token)
32     {
33         char *token2;
34         const char *c, *c_end;
35         char *c2;
36
37         memcpy(token, tokenValue, tokenLen);
38         token[tokenLen] = 0;
39
40         if (strcmp(token, EMPTYFIELD) == 0)
41         {
42             strcpy(token, "");
43         }
44
45         // Escape double quotes, and remove simple quotes
46         token2 = (char *) MALLOC((sizeof(char) * tokenLen) + 1);
47         c = token;
48         c_end = c + tokenLen;
49         c2 = token2;
50         while (c < c_end)
51         {
52             if (*c == DOUBLE_QUOTE)
53             {
54                 c++;
55                 if (*c == DOUBLE_QUOTE)
56                 {
57                     *c2 = DOUBLE_QUOTE;
58                     c++;
59                     c2++;
60                 }
61             }
62             else
63             {
64                 *c2 = *c;
65                 c++;
66                 c2++;
67             }
68         }
69         *c2 = 0;
70
71         // Add token
72         tokens[*tokenIdx] = token2;
73         (*tokenIdx)++;
74
75         FREE(token);
76
77         return TRUE;
78     }
79     return FALSE;
80 }
81
82 /* ==================================================================== */
83 char **splitLineCSV(const char *str, const char *sep, int *toks)
84 {
85     char **retstr = NULL;
86     const char *idx = NULL;
87     const char *end = NULL;
88     const char *sep_end = NULL;
89     const char *sep_idx = NULL;
90     int len = 0;
91     int curr_str = 0;
92     int inDoubleQuote = 0;
93
94     /* Usually, it should be ,, or ;; */
95     char tokenstring_to_search[64] = "";
96     /* Previous item will be replaced by ;__EMPTY_FIELD_CSV__; */
97     char tokenreplacement_string[64] = "";
98     char *substitutedstring = NULL;
99
100     sprintf(tokenstring_to_search, "%s%s", sep, sep);
101     sprintf(tokenreplacement_string, "%s%s%s", sep, EMPTYFIELD, sep);
102     substitutedstring = strsub(str, tokenstring_to_search, tokenreplacement_string);
103     /* in a string like foo;bar;;;, replace all the ;;, not only the first and last one */
104     while (strstr(substitutedstring, tokenstring_to_search) != NULL)
105     {
106         substitutedstring = strsub(substitutedstring, tokenstring_to_search, tokenreplacement_string);
107     }
108
109     if (strncmp(substitutedstring, sep, strlen(sep)) == 0)
110     {
111         char *tmp = NULL;
112         size_t l = strlen(substitutedstring) + strlen(EMPTYFIELD) + strlen(sep) + 1;
113         tmp = (char*)MALLOC(sizeof(char) * l);
114         sprintf(tmp, "%s%s%s", EMPTYFIELD, sep, &substitutedstring[1]);
115         FREE(substitutedstring);
116         substitutedstring = tmp;
117     }
118
119     if (substitutedstring[strlen(substitutedstring) - 1] == sep[0])
120     {
121         char *tmp = NULL;
122         size_t l = strlen(substitutedstring) + strlen(EMPTYFIELD) + 1;
123         tmp = (char*)MALLOC(sizeof(char) * l);
124         sprintf(tmp, "%s%s", substitutedstring, EMPTYFIELD);
125         FREE(substitutedstring);
126         substitutedstring = tmp;
127     }
128
129     sep_end = sep + strlen(sep);
130     end = substitutedstring + strlen(substitutedstring);
131
132     idx = substitutedstring;
133
134     if (strstr(substitutedstring, sep) == NULL)
135     {
136         *toks = 0;
137         FREE(substitutedstring);
138         return NULL;
139     }
140
141     retstr = (char **) MALLOC((sizeof(char *) * (int)strlen(substitutedstring)));
142     if (retstr == NULL)
143     {
144         *toks = 0;
145         FREE(substitutedstring);
146         return NULL;
147     }
148
149     while (idx < end)
150     {
151         // If we are in a double quoted field, we do not plit on separators
152         if (!inDoubleQuote)
153         {
154             sep_idx = sep;
155             while (sep_idx < sep_end)
156             {
157                 if ((*idx == *sep_idx))
158                 {
159                     if (len > 0)
160                     {
161                         if (curr_str < (int)strlen(substitutedstring))
162                         {
163                             // New token (= field)
164                             if (addToken(retstr, &curr_str, (char*)(idx - len), len))
165                             {
166                                 // Reset for next field
167                                 len = 0;
168                                 idx++;
169                             }
170                             else
171                             {
172                                 *toks = 0;
173                                 freeArrayOfString(retstr, (int)strlen(substitutedstring));
174                                 FREE(substitutedstring);
175                                 return NULL;
176                             }
177                         }
178
179                         if (curr_str >= (int)strlen(substitutedstring))
180                         {
181                             *toks = curr_str + 1;
182                             FREE(substitutedstring);
183                             return retstr;
184                         }
185                     }
186                     else
187                     {
188                         idx++;
189                         len = 0;
190                     }
191                 }
192                 else
193                 {
194                     sep_idx++;
195                 }
196             }
197         }
198
199         if (*idx == '"')
200         {
201             // Count number of consecutive double quotes
202             int nbDoubleQuotes = 0;
203             const char *idxTmp = idx;
204
205             while (*idxTmp == '"')
206             {
207                 idxTmp++;
208             }
209             nbDoubleQuotes = (int)(idxTmp - idx);
210
211             // if it is odd, we enter or leave a double quoted field
212             if (nbDoubleQuotes % 2 == 1)
213             {
214                 inDoubleQuote = (inDoubleQuote == 0) ? 1 : 0;
215             }
216             len += nbDoubleQuotes;
217             idx += nbDoubleQuotes;
218         }
219         else
220         {
221             len++;
222             idx++;
223         }
224     }
225
226     if (len > 0)
227     {
228         // New token (= field)
229         if (!addToken(retstr, &curr_str, (char*)(idx - len), len))
230         {
231             *toks = 0;
232             freeArrayOfString(retstr, (int)strlen(substitutedstring));
233             FREE(substitutedstring);
234             return NULL;
235         }
236     }
237
238     *toks = curr_str;
239
240     FREE(substitutedstring);
241
242     return retstr;
243 }
244 /* ==================================================================== */