Commit 1428bd45 1428bd4560d2e740c4941a0f8121c674c2ee7500 by Sergey Poznyakoff

Support for multibyte-encoding in MH.

* gnulib.modules (mbchar, mbiter, mbslen): New modules.
* mh/mh.h (mh_charset): New function.
* mh/mh_format.c (DFLWIDTH): New macro.
(mbsubstrlen, mbsnlen): New functions.
(compress_ws): Handle multibyte strings.
(put_string): Use number of characters, not
octets, to update ind.
(print_hdr_segment): Handle multibyte strings.
(print_simple_segment, print_fmt_string): Likewise.
(reset_fmt_defaults): Restore default WS compression.
(mh_format): Set LC_CTYPE based on the settings of
profile variables Charset and LC_BASE (new variable).
* mh/mh_init.c (mh_charset): New function.
(mh_decode_2047): Use mh_charset.
1 parent d39f4c2d
...@@ -14,6 +14,9 @@ gitlog-to-changelog ...@@ -14,6 +14,9 @@ gitlog-to-changelog
14 intprops 14 intprops
15 inttostr 15 inttostr
16 malloc 16 malloc
17 mbchar
18 mbiter
19 mbslen
17 mbswidth 20 mbswidth
18 obstack 21 obstack
19 realloc 22 realloc
......
...@@ -334,6 +334,7 @@ char *mh_create_message_id (int); ...@@ -334,6 +334,7 @@ char *mh_create_message_id (int);
334 int mh_whom (const char *filename, int check); 334 int mh_whom (const char *filename, int check);
335 void mh_set_reply_regex (const char *str); 335 void mh_set_reply_regex (const char *str);
336 int mh_decode_2047 (char *text, char **decoded_text); 336 int mh_decode_2047 (char *text, char **decoded_text);
337 const char *mh_charset (const char *);
337 338
338 int mh_alias_read (char *name, int fail); 339 int mh_alias_read (char *name, int fail);
339 int mh_alias_get (const char *name, mu_list_t *return_list); 340 int mh_alias_get (const char *name, mu_list_t *return_list);
......
...@@ -26,9 +26,15 @@ ...@@ -26,9 +26,15 @@
26 #ifdef HAVE_STRINGS_H 26 #ifdef HAVE_STRINGS_H
27 # include <strings.h> 27 # include <strings.h>
28 #endif 28 #endif
29 #include <string.h>
30 #include "mbiter.h"
31 #include "mbchar.h"
32 #include "mbswidth.h"
29 33
30 static char *_get_builtin_name (mh_builtin_fp ptr); 34 static char *_get_builtin_name (mh_builtin_fp ptr);
31 35
36 #define DFLWIDTH(mach) ((mach)->width - (mach)->ind)
37
32 /* Functions for handling string objects. */ 38 /* Functions for handling string objects. */
33 39
34 void 40 void
...@@ -106,17 +112,53 @@ strobj_realloc (strobj_t *obj, size_t length) ...@@ -106,17 +112,53 @@ strobj_realloc (strobj_t *obj, size_t length)
106 } 112 }
107 } 113 }
108 114
109 /* Compress whitespace in a string */ 115 /* Return the length (number of octets) of a substring of
116 string STR of length LEN, such that it contains NCOL
117 multibyte characters. */
118 int
119 mbsubstrlen (char *str, size_t len, size_t ncol)
120 {
121 int ret = 0;
122 mbi_iterator_t iter;
123
124 if (ncol <= 0)
125 return 0;
126
127 for (mbi_init (iter, str, len);
128 ncol && mbi_avail (iter);
129 ncol--, mbi_advance (iter))
130 ret += mb_len (mbi_cur (iter));
131 return ret;
132 }
133
134 /* Return the number of multibyte characters in the first LEN bytes
135 of character string STR. */
136 size_t
137 mbsnlen (char *str, size_t len)
138 {
139 int ret = 0;
140 mbi_iterator_t iter;
141
142 for (mbi_init (iter, str, len); mbi_avail (iter); mbi_advance (iter))
143 ret++;
144 return ret;
145 }
146
147 /* Compress whitespace in a string (multi-byte) */
110 static void 148 static void
111 compress_ws (char *str, size_t *size) 149 compress_ws (char *str, size_t *psize)
112 { 150 {
113 unsigned char *p, *q; 151 unsigned char *p, *q;
114 size_t len = *size; 152 size_t size = *psize;
153 mbi_iterator_t iter;
115 int space = 0; 154 int space = 0;
116 155
117 for (p = q = (unsigned char*) str; len; len--, q++) 156 for (p = q = (unsigned char*) str,
157 mbi_init (iter, str, size);
158 mbi_avail (iter);
159 mbi_advance (iter))
118 { 160 {
119 if (isspace (*q)) 161 if (mb_isspace (mbi_cur (iter)))
120 { 162 {
121 if (!space) 163 if (!space)
122 *p++ = ' '; 164 *p++ = ' ';
...@@ -126,11 +168,15 @@ compress_ws (char *str, size_t *size) ...@@ -126,11 +168,15 @@ compress_ws (char *str, size_t *size)
126 else if (space) 168 else if (space)
127 space = 0; 169 space = 0;
128 170
129 if (isprint (*q)) 171 if (mb_isprint (mbi_cur (iter)))
130 *p++ = *q; 172 {
173 size_t len = mb_len (mbi_cur (iter));
174 memcpy (p, mb_ptr (mbi_cur (iter)), len);
175 p += len;
176 }
131 } 177 }
132 *p = 0; 178 *p = 0;
133 *size = p - (unsigned char*) str; 179 *psize = p - (unsigned char*) str;
134 } 180 }
135 181
136 #define COMPRESS_WS(mach, str, size) \ 182 #define COMPRESS_WS(mach, str, size) \
...@@ -145,8 +191,9 @@ static void ...@@ -145,8 +191,9 @@ static void
145 put_string (struct mh_machine *mach, char *str, int len) 191 put_string (struct mh_machine *mach, char *str, int len)
146 { 192 {
147 if (len == 0) 193 if (len == 0)
148 len = strlen (str); 194 return;
149 obstack_grow (&mach->stk, str, len); 195 obstack_grow (&mach->stk, str, len);
196 len = mbsnwidth (str, len, 0);
150 mach->ind += len; 197 mach->ind += len;
151 } 198 }
152 199
...@@ -156,40 +203,46 @@ print_hdr_segment (struct mh_machine *mach, char *str, size_t len) ...@@ -156,40 +203,46 @@ print_hdr_segment (struct mh_machine *mach, char *str, size_t len)
156 if (!len) 203 if (!len)
157 len = strlen (str); 204 len = strlen (str);
158 205
159 if (len < mach->width) 206 if (mbsnlen (str, len) < mach->width)
160 put_string (mach, str, len); 207 put_string (mach, str, len);
161 else 208 else
162 { 209 {
163 char *endp = str + len; 210 while (1)
164
165 while (str < endp)
166 { 211 {
167 size_t rest; 212 mbi_iterator_t iter;
168 char *p; 213 size_t rest = DFLWIDTH (mach);
169 size_t size; 214 size_t width = mbsnlen (str, len);
215 size_t off, size;
170 216
171 size = endp - str; 217 if (width <= rest)
172 rest = mach->width - mach->ind;
173 if (size < rest)
174 { 218 {
175 put_string (mach, str, size); 219 put_string (mach, str, len);
176 break; 220 break;
177 } 221 }
178 222
179 for (p = str + rest - 1; p > str && !isspace (*p); p--) 223 size = off = 0;
180 ; 224 for (mbi_init (iter, str, len);
225 mbi_avail (iter);
226 mbi_advance (iter))
227 {
228 if (mb_isspace (mbi_cur (iter)))
229 off = size;
230 size += mb_len (mbi_cur (iter));
231 }
181 232
182 if (p > str) 233 if (off > 0)
183 { 234 {
184 put_string (mach, str, p - str); 235 put_string (mach, str, off);
185 put_string (mach, "\n\t", 0); 236 put_string (mach, "\n ", 9);
186 mach->ind = 8; 237 mach->ind = 8;
187 str = p; 238 str += off;
239 len -= off;
188 } 240 }
189 else 241 else
190 { 242 {
191 put_string (mach, str, size); 243 size = mbsubstrlen (str, len, rest);
192 str += len; 244 put_string (mach, str, len);
245 break;
193 } 246 }
194 } 247 }
195 } 248 }
...@@ -232,7 +285,7 @@ print_simple_segment (struct mh_machine *mach, size_t width, ...@@ -232,7 +285,7 @@ print_simple_segment (struct mh_machine *mach, size_t width,
232 if (!width) 285 if (!width)
233 width = mach->width; 286 width = mach->width;
234 287
235 rest = width - mach->ind; 288 rest = DFLWIDTH (mach);
236 if (rest == 0) 289 if (rest == 0)
237 { 290 {
238 if (len == 1 && str[0] == '\n') 291 if (len == 1 && str[0] == '\n')
...@@ -240,10 +293,7 @@ print_simple_segment (struct mh_machine *mach, size_t width, ...@@ -240,10 +293,7 @@ print_simple_segment (struct mh_machine *mach, size_t width,
240 return; 293 return;
241 } 294 }
242 295
243 if (len > rest) 296 put_string (mach, str, mbsubstrlen (str, len, rest));
244 len = rest;
245
246 put_string (mach, str, len);
247 } 297 }
248 298
249 static void 299 static void
...@@ -274,13 +324,21 @@ static void ...@@ -274,13 +324,21 @@ static void
274 print_fmt_string (struct mh_machine *mach, size_t fmtwidth, char *str) 324 print_fmt_string (struct mh_machine *mach, size_t fmtwidth, char *str)
275 { 325 {
276 size_t len = strlen (str); 326 size_t len = strlen (str);
277 if (len > fmtwidth) 327 size_t width = mbslen (str);
278 len = fmtwidth; 328
329 if (fmtwidth && width > fmtwidth)
330 {
331 len = mbsubstrlen (str, len, fmtwidth);
332 width = fmtwidth;
333 }
334 else
335 len = mbsubstrlen (str, len, DFLWIDTH (mach));
336
279 put_string (mach, str, len); 337 put_string (mach, str, len);
280 338
281 if (fmtwidth > len) 339 if (fmtwidth > width)
282 { 340 {
283 fmtwidth -= len; 341 fmtwidth -= width;
284 mach->ind += fmtwidth; 342 mach->ind += fmtwidth;
285 while (fmtwidth--) 343 while (fmtwidth--)
286 obstack_1grow (&mach->stk, ' '); 344 obstack_1grow (&mach->stk, ' ');
...@@ -293,7 +351,7 @@ reset_fmt_defaults (struct mh_machine *mach) ...@@ -293,7 +351,7 @@ reset_fmt_defaults (struct mh_machine *mach)
293 const char *p; 351 const char *p;
294 352
295 mach->fmtflags = 0; 353 mach->fmtflags = 0;
296 p = mh_global_profile_get ("Compress-WS", NULL); 354 p = mh_global_profile_get ("Compress-WS", "yes");
297 if (p && (mu_c_strcasecmp (p, "yes") == 0 355 if (p && (mu_c_strcasecmp (p, "yes") == 0
298 || mu_c_strcasecmp (p, "true") == 0)) 356 || mu_c_strcasecmp (p, "true") == 0))
299 mach->fmtflags |= MH_FMT_COMPWS; 357 mach->fmtflags |= MH_FMT_COMPWS;
...@@ -409,6 +467,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -409,6 +467,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
409 { 467 {
410 struct mh_machine mach; 468 struct mh_machine mach;
411 char buf[64]; 469 char buf[64];
470 const char *charset = mh_global_profile_get ("Charset", NULL);
412 471
413 memset (&mach, 0, sizeof (mach)); 472 memset (&mach, 0, sizeof (mach));
414 mach.progsize = fmt->progsize; 473 mach.progsize = fmt->progsize;
...@@ -424,10 +483,37 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -424,10 +483,37 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
424 483
425 reset_fmt_defaults (&mach); 484 reset_fmt_defaults (&mach);
426 485
427 while (!mach.stop) 486 #if HAVE_SETLOCALE
487 if (charset && strcmp (charset, "auto"))
488 {
489 /* Try to set LC_CTYPE according to the value of Charset variable.
490 If Charset is `auto', there's no need to do anything, since it
491 is already set. Otherwise, we need to construct a valid locale
492 value with Charset as its codeset part. The problem is, what
493 language and territory to use for that locale.
494
495 Neither LANG nor any other environment variable is of any use,
496 because if it were, the user would have set "Charset: auto".
497 It would be logical to use 'C' or 'POSIX', but these do not
498 work with '.UTF-8'. So, in the absence of any viable alternative,
499 'en_US' is selected. This choice may be overridden by setting
500 the LC_BASE mh_profile variable to the desired base part.
501 */
502 const char *lc_base = mh_global_profile_get ("LC_BASE", "en_US");
503 char *locale = xmalloc (strlen (lc_base) + 1 + strlen (charset) + 1);
504 strcpy (locale, lc_base);
505 strcat (locale, ".");
506 strcat (locale, charset);
507 if (!setlocale (LC_CTYPE, locale))
508 mu_error (_("cannot set LC_CTYPE %s"), locale);
509 free (locale);
510 }
511 #endif
512
513 while (!mach.stop && mach.ind < mach.width)
428 { 514 {
429 mh_opcode_t opcode; 515 mh_opcode_t opcode;
430 switch (opcode = MHI_OPCODE(mach.prog[mach.pc++])) 516 switch (opcode = MHI_OPCODE (mach.prog[mach.pc++]))
431 { 517 {
432 case mhop_nop: 518 case mhop_nop:
433 break; 519 break;
...@@ -437,7 +523,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -437,7 +523,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
437 break; 523 break;
438 524
439 case mhop_branch: 525 case mhop_branch:
440 mach.pc += MHI_NUM(mach.prog[mach.pc]); 526 mach.pc += MHI_NUM (mach.prog[mach.pc]);
441 break; 527 break;
442 528
443 case mhop_num_asgn: 529 case mhop_num_asgn:
...@@ -449,27 +535,27 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -449,27 +535,27 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
449 break; 535 break;
450 536
451 case mhop_num_arg: 537 case mhop_num_arg:
452 mach.arg_num = MHI_NUM(mach.prog[mach.pc++]); 538 mach.arg_num = MHI_NUM (mach.prog[mach.pc++]);
453 break; 539 break;
454 540
455 case mhop_str_arg: 541 case mhop_str_arg:
456 { 542 {
457 size_t skip = MHI_NUM(mach.prog[mach.pc++]); 543 size_t skip = MHI_NUM (mach.prog[mach.pc++]);
458 strobj_set (&mach.arg_str, MHI_STR(mach.prog[mach.pc])); 544 strobj_set (&mach.arg_str, MHI_STR (mach.prog[mach.pc]));
459 mach.pc += skip; 545 mach.pc += skip;
460 } 546 }
461 break; 547 break;
462 548
463 case mhop_num_branch: 549 case mhop_num_branch:
464 if (!mach.arg_num) 550 if (!mach.arg_num)
465 mach.pc += MHI_NUM(mach.prog[mach.pc]); 551 mach.pc += MHI_NUM (mach.prog[mach.pc]);
466 else 552 else
467 mach.pc++; 553 mach.pc++;
468 break; 554 break;
469 555
470 case mhop_str_branch: 556 case mhop_str_branch:
471 if (!*strobj_ptr (&mach.arg_str)) 557 if (!*strobj_ptr (&mach.arg_str))
472 mach.pc += MHI_NUM(mach.prog[mach.pc]); 558 mach.pc += MHI_NUM (mach.prog[mach.pc]);
473 else 559 else
474 mach.pc++; 560 mach.pc++;
475 break; 561 break;
...@@ -503,7 +589,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -503,7 +589,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
503 mu_body_t body = NULL; 589 mu_body_t body = NULL;
504 mu_stream_t stream = NULL; 590 mu_stream_t stream = NULL;
505 size_t size = 0, off, str_off, nread; 591 size_t size = 0, off, str_off, nread;
506 size_t rest = mach.width - mach.ind; 592 size_t rest = DFLWIDTH (&mach);
507 593
508 strobj_free (&mach.arg_str); 594 strobj_free (&mach.arg_str);
509 mu_message_get_body (mach.message, &body); 595 mu_message_get_body (mach.message, &body);
...@@ -564,7 +650,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno, ...@@ -564,7 +650,7 @@ mh_format (mh_format_t *fmt, mu_message_t msg, size_t msgno,
564 break; 650 break;
565 651
566 case mhop_fmtspec: 652 case mhop_fmtspec:
567 mach.fmtflags = MHI_NUM(mach.prog[mach.pc++]); 653 mach.fmtflags = MHI_NUM (mach.prog[mach.pc++]);
568 break; 654 break;
569 655
570 default: 656 default:
...@@ -615,7 +701,7 @@ mh_format_dump (mh_format_t *fmt) ...@@ -615,7 +701,7 @@ mh_format_dump (mh_format_t *fmt)
615 int num; 701 int num;
616 702
617 printf ("% 4.4ld: ", (long) pc); 703 printf ("% 4.4ld: ", (long) pc);
618 switch (opcode = MHI_OPCODE(prog[pc++])) 704 switch (opcode = MHI_OPCODE (prog[pc++]))
619 { 705 {
620 case mhop_nop: 706 case mhop_nop:
621 printf ("nop"); 707 printf ("nop");
...@@ -627,7 +713,7 @@ mh_format_dump (mh_format_t *fmt) ...@@ -627,7 +713,7 @@ mh_format_dump (mh_format_t *fmt)
627 break; 713 break;
628 714
629 case mhop_branch: 715 case mhop_branch:
630 num = MHI_NUM(prog[pc++]); 716 num = MHI_NUM (prog[pc++]);
631 printf ("branch %d, %lu", 717 printf ("branch %d, %lu",
632 num, (unsigned long) pc + num - 1); 718 num, (unsigned long) pc + num - 1);
633 break; 719 break;
...@@ -641,14 +727,14 @@ mh_format_dump (mh_format_t *fmt) ...@@ -641,14 +727,14 @@ mh_format_dump (mh_format_t *fmt)
641 break; 727 break;
642 728
643 case mhop_num_arg: 729 case mhop_num_arg:
644 num = MHI_NUM(prog[pc++]); 730 num = MHI_NUM (prog[pc++]);
645 printf ("num_arg %d", num); 731 printf ("num_arg %d", num);
646 break; 732 break;
647 733
648 case mhop_str_arg: 734 case mhop_str_arg:
649 { 735 {
650 size_t skip = MHI_NUM(prog[pc++]); 736 size_t skip = MHI_NUM (prog[pc++]);
651 char *s = MHI_STR(prog[pc]); 737 char *s = MHI_STR (prog[pc]);
652 printf ("str_arg \""); 738 printf ("str_arg \"");
653 for (; *s; s++) 739 for (; *s; s++)
654 { 740 {
...@@ -696,13 +782,13 @@ mh_format_dump (mh_format_t *fmt) ...@@ -696,13 +782,13 @@ mh_format_dump (mh_format_t *fmt)
696 break; 782 break;
697 783
698 case mhop_num_branch: 784 case mhop_num_branch:
699 num = MHI_NUM(prog[pc++]); 785 num = MHI_NUM (prog[pc++]);
700 printf ("num_branch %d, %lu", 786 printf ("num_branch %d, %lu",
701 num, (unsigned long) (pc + num - 1)); 787 num, (unsigned long) (pc + num - 1));
702 break; 788 break;
703 789
704 case mhop_str_branch: 790 case mhop_str_branch:
705 num = MHI_NUM(prog[pc++]); 791 num = MHI_NUM (prog[pc++]);
706 printf ("str_branch %d, %lu", 792 printf ("str_branch %d, %lu",
707 num, (unsigned long) (pc + num - 1)); 793 num, (unsigned long) (pc + num - 1));
708 break; 794 break;
...@@ -753,7 +839,7 @@ mh_format_dump (mh_format_t *fmt) ...@@ -753,7 +839,7 @@ mh_format_dump (mh_format_t *fmt)
753 { 839 {
754 int space = 0; 840 int space = 0;
755 841
756 num = MHI_NUM(prog[pc++]); 842 num = MHI_NUM (prog[pc++]);
757 printf ("fmtspec: %#x, ", num); 843 printf ("fmtspec: %#x, ", num);
758 if (num & MH_FMT_RALIGN) 844 if (num & MH_FMT_RALIGN)
759 { 845 {
...@@ -840,7 +926,7 @@ builtin_width (struct mh_machine *mach) ...@@ -840,7 +926,7 @@ builtin_width (struct mh_machine *mach)
840 static void 926 static void
841 builtin_charleft (struct mh_machine *mach) 927 builtin_charleft (struct mh_machine *mach)
842 { 928 {
843 mach->arg_num = mach->width - mach->ind; 929 mach->arg_num = DFLWIDTH (mach);
844 } 930 }
845 931
846 static void 932 static void
......
...@@ -870,13 +870,13 @@ mh_set_reply_regex (const char *str) ...@@ -870,13 +870,13 @@ mh_set_reply_regex (const char *str)
870 err ? err : ""); 870 err ? err : "");
871 } 871 }
872 872
873 int 873 const char *
874 mh_decode_2047 (char *text, char **decoded_text) 874 mh_charset (const char *dfl)
875 { 875 {
876 const char *charset = mh_global_profile_get ("Charset", NULL); 876 const char *charset = mh_global_profile_get ("Charset", dfl);
877 877
878 if (!charset) 878 if (!charset)
879 return 1; 879 return NULL;
880 if (mu_c_strcasecmp (charset, "auto") == 0) 880 if (mu_c_strcasecmp (charset, "auto") == 0)
881 { 881 {
882 /* Try to deduce the charset from LC_ALL variable */ 882 /* Try to deduce the charset from LC_ALL variable */
...@@ -899,7 +899,13 @@ mh_decode_2047 (char *text, char **decoded_text) ...@@ -899,7 +899,13 @@ mh_decode_2047 (char *text, char **decoded_text)
899 free (tmp); 899 free (tmp);
900 } 900 }
901 } 901 }
902 return charset;
903 }
902 904
905 int
906 mh_decode_2047 (char *text, char **decoded_text)
907 {
908 const char *charset = mh_charset (NULL);
903 if (!charset) 909 if (!charset)
904 return 1; 910 return 1;
905 911
......