stripspace: call U+0020 a "space" instead of a "blank"
[git] / ws.c
1 /*
2  * Whitespace rules
3  *
4  * Copyright (c) 2007 Junio C Hamano
5  */
6
7 #include "cache.h"
8 #include "attr.h"
9
10 static struct whitespace_rule {
11         const char *rule_name;
12         unsigned rule_bits;
13         unsigned loosens_error:1,
14                 exclude_default:1;
15 } whitespace_rule_names[] = {
16         { "trailing-space", WS_TRAILING_SPACE, 0 },
17         { "space-before-tab", WS_SPACE_BEFORE_TAB, 0 },
18         { "indent-with-non-tab", WS_INDENT_WITH_NON_TAB, 0 },
19         { "cr-at-eol", WS_CR_AT_EOL, 1 },
20         { "blank-at-eol", WS_BLANK_AT_EOL, 0 },
21         { "blank-at-eof", WS_BLANK_AT_EOF, 0 },
22         { "tab-in-indent", WS_TAB_IN_INDENT, 0, 1 },
23 };
24
25 unsigned parse_whitespace_rule(const char *string)
26 {
27         unsigned rule = WS_DEFAULT_RULE;
28
29         while (string) {
30                 int i;
31                 size_t len;
32                 const char *ep;
33                 int negated = 0;
34
35                 string = string + strspn(string, ", \t\n\r");
36                 ep = strchrnul(string, ',');
37                 len = ep - string;
38
39                 if (*string == '-') {
40                         negated = 1;
41                         string++;
42                         len--;
43                 }
44                 if (!len)
45                         break;
46                 for (i = 0; i < ARRAY_SIZE(whitespace_rule_names); i++) {
47                         if (strncmp(whitespace_rule_names[i].rule_name,
48                                     string, len))
49                                 continue;
50                         if (negated)
51                                 rule &= ~whitespace_rule_names[i].rule_bits;
52                         else
53                                 rule |= whitespace_rule_names[i].rule_bits;
54                         break;
55                 }
56                 if (strncmp(string, "tabwidth=", 9) == 0) {
57                         unsigned tabwidth = atoi(string + 9);
58                         if (0 < tabwidth && tabwidth < 0100) {
59                                 rule &= ~WS_TAB_WIDTH_MASK;
60                                 rule |= tabwidth;
61                         }
62                         else
63                                 warning("tabwidth %.*s out of range",
64                                         (int)(len - 9), string + 9);
65                 }
66                 string = ep;
67         }
68
69         if (rule & WS_TAB_IN_INDENT && rule & WS_INDENT_WITH_NON_TAB)
70                 die("cannot enforce both tab-in-indent and indent-with-non-tab");
71         return rule;
72 }
73
74 static void setup_whitespace_attr_check(struct git_attr_check *check)
75 {
76         static struct git_attr *attr_whitespace;
77
78         if (!attr_whitespace)
79                 attr_whitespace = git_attr("whitespace");
80         check[0].attr = attr_whitespace;
81 }
82
83 unsigned whitespace_rule(const char *pathname)
84 {
85         struct git_attr_check attr_whitespace_rule;
86
87         setup_whitespace_attr_check(&attr_whitespace_rule);
88         if (!git_check_attr(pathname, 1, &attr_whitespace_rule)) {
89                 const char *value;
90
91                 value = attr_whitespace_rule.value;
92                 if (ATTR_TRUE(value)) {
93                         /* true (whitespace) */
94                         unsigned all_rule = ws_tab_width(whitespace_rule_cfg);
95                         int i;
96                         for (i = 0; i < ARRAY_SIZE(whitespace_rule_names); i++)
97                                 if (!whitespace_rule_names[i].loosens_error &&
98                                     !whitespace_rule_names[i].exclude_default)
99                                         all_rule |= whitespace_rule_names[i].rule_bits;
100                         return all_rule;
101                 } else if (ATTR_FALSE(value)) {
102                         /* false (-whitespace) */
103                         return ws_tab_width(whitespace_rule_cfg);
104                 } else if (ATTR_UNSET(value)) {
105                         /* reset to default (!whitespace) */
106                         return whitespace_rule_cfg;
107                 } else {
108                         /* string */
109                         return parse_whitespace_rule(value);
110                 }
111         } else {
112                 return whitespace_rule_cfg;
113         }
114 }
115
116 /* The returned string should be freed by the caller. */
117 char *whitespace_error_string(unsigned ws)
118 {
119         struct strbuf err = STRBUF_INIT;
120         if ((ws & WS_TRAILING_SPACE) == WS_TRAILING_SPACE)
121                 strbuf_addstr(&err, "trailing whitespace");
122         else {
123                 if (ws & WS_BLANK_AT_EOL)
124                         strbuf_addstr(&err, "trailing whitespace");
125                 if (ws & WS_BLANK_AT_EOF) {
126                         if (err.len)
127                                 strbuf_addstr(&err, ", ");
128                         strbuf_addstr(&err, "new blank line at EOF");
129                 }
130         }
131         if (ws & WS_SPACE_BEFORE_TAB) {
132                 if (err.len)
133                         strbuf_addstr(&err, ", ");
134                 strbuf_addstr(&err, "space before tab in indent");
135         }
136         if (ws & WS_INDENT_WITH_NON_TAB) {
137                 if (err.len)
138                         strbuf_addstr(&err, ", ");
139                 strbuf_addstr(&err, "indent with spaces");
140         }
141         if (ws & WS_TAB_IN_INDENT) {
142                 if (err.len)
143                         strbuf_addstr(&err, ", ");
144                 strbuf_addstr(&err, "tab in indent");
145         }
146         return strbuf_detach(&err, NULL);
147 }
148
149 /* If stream is non-NULL, emits the line after checking. */
150 static unsigned ws_check_emit_1(const char *line, int len, unsigned ws_rule,
151                                 FILE *stream, const char *set,
152                                 const char *reset, const char *ws)
153 {
154         unsigned result = 0;
155         int written = 0;
156         int trailing_whitespace = -1;
157         int trailing_newline = 0;
158         int trailing_carriage_return = 0;
159         int i;
160
161         /* Logic is simpler if we temporarily ignore the trailing newline. */
162         if (len > 0 && line[len - 1] == '\n') {
163                 trailing_newline = 1;
164                 len--;
165         }
166         if ((ws_rule & WS_CR_AT_EOL) &&
167             len > 0 && line[len - 1] == '\r') {
168                 trailing_carriage_return = 1;
169                 len--;
170         }
171
172         /* Check for trailing whitespace. */
173         if (ws_rule & WS_BLANK_AT_EOL) {
174                 for (i = len - 1; i >= 0; i--) {
175                         if (isspace(line[i])) {
176                                 trailing_whitespace = i;
177                                 result |= WS_BLANK_AT_EOL;
178                         }
179                         else
180                                 break;
181                 }
182         }
183
184         if (trailing_whitespace == -1)
185                 trailing_whitespace = len;
186
187         /* Check indentation */
188         for (i = 0; i < trailing_whitespace; i++) {
189                 if (line[i] == ' ')
190                         continue;
191                 if (line[i] != '\t')
192                         break;
193                 if ((ws_rule & WS_SPACE_BEFORE_TAB) && written < i) {
194                         result |= WS_SPACE_BEFORE_TAB;
195                         if (stream) {
196                                 fputs(ws, stream);
197                                 fwrite(line + written, i - written, 1, stream);
198                                 fputs(reset, stream);
199                                 fwrite(line + i, 1, 1, stream);
200                         }
201                 } else if (ws_rule & WS_TAB_IN_INDENT) {
202                         result |= WS_TAB_IN_INDENT;
203                         if (stream) {
204                                 fwrite(line + written, i - written, 1, stream);
205                                 fputs(ws, stream);
206                                 fwrite(line + i, 1, 1, stream);
207                                 fputs(reset, stream);
208                         }
209                 } else if (stream) {
210                         fwrite(line + written, i - written + 1, 1, stream);
211                 }
212                 written = i + 1;
213         }
214
215         /* Check for indent using non-tab. */
216         if ((ws_rule & WS_INDENT_WITH_NON_TAB) && i - written >= ws_tab_width(ws_rule)) {
217                 result |= WS_INDENT_WITH_NON_TAB;
218                 if (stream) {
219                         fputs(ws, stream);
220                         fwrite(line + written, i - written, 1, stream);
221                         fputs(reset, stream);
222                 }
223                 written = i;
224         }
225
226         if (stream) {
227                 /*
228                  * Now the rest of the line starts at "written".
229                  * The non-highlighted part ends at "trailing_whitespace".
230                  */
231
232                 /* Emit non-highlighted (middle) segment. */
233                 if (trailing_whitespace - written > 0) {
234                         fputs(set, stream);
235                         fwrite(line + written,
236                             trailing_whitespace - written, 1, stream);
237                         fputs(reset, stream);
238                 }
239
240                 /* Highlight errors in trailing whitespace. */
241                 if (trailing_whitespace != len) {
242                         fputs(ws, stream);
243                         fwrite(line + trailing_whitespace,
244                             len - trailing_whitespace, 1, stream);
245                         fputs(reset, stream);
246                 }
247                 if (trailing_carriage_return)
248                         fputc('\r', stream);
249                 if (trailing_newline)
250                         fputc('\n', stream);
251         }
252         return result;
253 }
254
/*
 * Check line against ws_rule and emit it through ws_check_emit_1(),
 * passing stream, set, reset and ws along unchanged.  The error bits
 * computed by the check are discarded.
 */
void ws_check_emit(const char *line, int len, unsigned ws_rule,
		   FILE *stream, const char *set,
		   const char *reset, const char *ws)
{
	ws_check_emit_1(line, len, ws_rule, stream, set, reset, ws);
}
261
/*
 * Check line against ws_rule without emitting anything, and return
 * the WS_* bits of the whitespace errors found.
 */
unsigned ws_check(const char *line, int len, unsigned ws_rule)
{
	FILE *no_stream = NULL;
	const char *no_seq = NULL;

	return ws_check_emit_1(line, len, ws_rule,
			       no_stream, no_seq, no_seq, no_seq);
}
266
/*
 * Return 1 when the first len bytes of line consist solely of
 * whitespace (trivially true when len is not positive), 0 otherwise.
 * ws_rule is accepted but not consulted.
 */
int ws_blank_line(const char *line, int len, unsigned ws_rule)
{
	int pos;

	/*
	 * We _might_ want to treat CR differently from other
	 * whitespace characters when ws_rule has WS_CR_AT_EOL, but
	 * for now every whitespace character counts the same.
	 */
	for (pos = 0; pos < len; pos++) {
		if (!isspace(line[pos]))
			return 0;
	}
	return 1;
}
281
282 /* Copy the line onto the end of the strbuf while fixing whitespaces */
283 void ws_fix_copy(struct strbuf *dst, const char *src, int len, unsigned ws_rule, int *error_count)
284 {
285         /*
286          * len is number of bytes to be copied from src, starting
287          * at src.  Typically src[len-1] is '\n', unless this is
288          * the incomplete last line.
289          */
290         int i;
291         int add_nl_to_tail = 0;
292         int add_cr_to_tail = 0;
293         int fixed = 0;
294         int last_tab_in_indent = -1;
295         int last_space_in_indent = -1;
296         int need_fix_leading_space = 0;
297
298         /*
299          * Strip trailing whitespace
300          */
301         if (ws_rule & WS_BLANK_AT_EOL) {
302                 if (0 < len && src[len - 1] == '\n') {
303                         add_nl_to_tail = 1;
304                         len--;
305                         if (0 < len && src[len - 1] == '\r') {
306                                 add_cr_to_tail = !!(ws_rule & WS_CR_AT_EOL);
307                                 len--;
308                         }
309                 }
310                 if (0 < len && isspace(src[len - 1])) {
311                         while (0 < len && isspace(src[len-1]))
312                                 len--;
313                         fixed = 1;
314                 }
315         }
316
317         /*
318          * Check leading whitespaces (indent)
319          */
320         for (i = 0; i < len; i++) {
321                 char ch = src[i];
322                 if (ch == '\t') {
323                         last_tab_in_indent = i;
324                         if ((ws_rule & WS_SPACE_BEFORE_TAB) &&
325                             0 <= last_space_in_indent)
326                             need_fix_leading_space = 1;
327                 } else if (ch == ' ') {
328                         last_space_in_indent = i;
329                         if ((ws_rule & WS_INDENT_WITH_NON_TAB) &&
330                             ws_tab_width(ws_rule) <= i - last_tab_in_indent)
331                                 need_fix_leading_space = 1;
332                 } else
333                         break;
334         }
335
336         if (need_fix_leading_space) {
337                 /* Process indent ourselves */
338                 int consecutive_spaces = 0;
339                 int last = last_tab_in_indent + 1;
340
341                 if (ws_rule & WS_INDENT_WITH_NON_TAB) {
342                         /* have "last" point at one past the indent */
343                         if (last_tab_in_indent < last_space_in_indent)
344                                 last = last_space_in_indent + 1;
345                         else
346                                 last = last_tab_in_indent + 1;
347                 }
348
349                 /*
350                  * between src[0..last-1], strip the funny spaces,
351                  * updating them to tab as needed.
352                  */
353                 for (i = 0; i < last; i++) {
354                         char ch = src[i];
355                         if (ch != ' ') {
356                                 consecutive_spaces = 0;
357                                 strbuf_addch(dst, ch);
358                         } else {
359                                 consecutive_spaces++;
360                                 if (consecutive_spaces == ws_tab_width(ws_rule)) {
361                                         strbuf_addch(dst, '\t');
362                                         consecutive_spaces = 0;
363                                 }
364                         }
365                 }
366                 while (0 < consecutive_spaces--)
367                         strbuf_addch(dst, ' ');
368                 len -= last;
369                 src += last;
370                 fixed = 1;
371         } else if ((ws_rule & WS_TAB_IN_INDENT) && last_tab_in_indent >= 0) {
372                 /* Expand tabs into spaces */
373                 int start = dst->len;
374                 int last = last_tab_in_indent + 1;
375                 for (i = 0; i < last; i++) {
376                         if (src[i] == '\t')
377                                 do {
378                                         strbuf_addch(dst, ' ');
379                                 } while ((dst->len - start) % ws_tab_width(ws_rule));
380                         else
381                                 strbuf_addch(dst, src[i]);
382                 }
383                 len -= last;
384                 src += last;
385                 fixed = 1;
386         }
387
388         strbuf_add(dst, src, len);
389         if (add_cr_to_tail)
390                 strbuf_addch(dst, '\r');
391         if (add_nl_to_tail)
392                 strbuf_addch(dst, '\n');
393         if (fixed && error_count)
394                 (*error_count)++;
395 }