summary | refs | log | tree | commit | diff
path: root/input.c
diff options
context:
space:
mode:
author: Simon Tatham <anakin@pobox.com> 2004-04-19 17:09:49 +0000
committer: Simon Tatham <anakin@pobox.com> 2004-04-19 17:09:49 +0000
commit: 8a9d3f97956db97b0813a6d24c486371ff14bd80 (patch)
tree: a080b34a57cee86398a0f6a181fe80a0a869eae9 /input.c
parent: f42941536c6c16ba8b89dd5f25d8a747e3d5495d (diff)
download: halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.zip
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.gz
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.bz2
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.xz
Support for \cfg{input-charset}. Input files can now be in ASCII,
8859-*, UTF-8, or a variety of more fun encodings including various multibyte ones. [originally from svn r4095]
Diffstat (limited to 'input.c')
-rw-r--r-- input.c 97
1 files changed, 70 insertions, 27 deletions
diff --git a/input.c b/input.c
index efce410..d607e86 100644
--- a/input.c
+++ b/input.c
@@ -82,6 +82,16 @@ static void macrocleanup(tree234 *macros) {
freetree234(macros);
}
+static void input_configure(input *in, paragraph *cfg) { /* Apply one configuration paragraph to the input state; only "input-charset" is acted on, any other keyword is ignored. */
+ assert(cfg->type == para_Config); /* Callers must pass configuration paragraphs only. */
+
+ if (!ustricmp(cfg->keyword, L"input-charset")) { /* Taken when ustricmp returns 0 (presumably a case-insensitive match - confirm). */
+ char *csname = utoa_dup(uadv(cfg->keyword)); /* Presumably uadv() steps past the keyword to the value stored after it, and utoa_dup() returns an allocated narrow copy (it is sfree'd below) - confirm. */
+ in->charset = charset_from_localenc(csname); /* NOTE(review): the result is stored unchecked and in->csstate is left untouched here - confirm both are safe for unknown names / mid-file switches. */
+ sfree(csname);
+ }
+}
+
/*
* Can return EOF
*/
@@ -103,36 +113,63 @@ static int get(input *in, filepos *pos) {
return c;
}
else if (in->currfp) {
- int c = getc(in->currfp);
- if (c == EOF) {
- fclose(in->currfp);
- in->currfp = NULL;
- }
- /* Track line numbers, for error reporting */
- if (pos)
- *pos = in->pos;
- if (in->reportcols) {
- switch (c) {
- case '\t':
- in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
- break;
- case '\n':
- in->pos.col = 1;
- in->pos.line++;
- break;
- default:
- in->pos.col++;
- break;
+ while (in->wcpos >= in->nwc) {
+
+ int c = getc(in->currfp);
+
+ if (c == EOF) {
+ fclose(in->currfp);
+ in->currfp = NULL;
+ return EOF;
+ }
+ /* Track line numbers, for error reporting */
+ if (pos)
+ *pos = in->pos;
+ if (in->reportcols) {
+ switch (c) {
+ case '\t':
+ in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
+ break;
+ case '\n':
+ in->pos.col = 1;
+ in->pos.line++;
+ break;
+ default:
+ in->pos.col++;
+ break;
+ }
+ } else {
+ in->pos.col = -1;
+ if (c == '\n')
+ in->pos.line++;
+ }
+
+ /*
+ * Do input character set translation, so that we return
+ * Unicode.
+ */
+ {
+ char buf[1];
+ char const *p;
+ int inlen;
+
+ buf[0] = (char)c;
+ p = buf;
+ inlen = 1;
+
+ in->nwc = charset_to_unicode(&p, &inlen,
+ in->wc, lenof(in->wc),
+ in->charset, &in->csstate,
+ NULL, 0);
+ assert(p == buf+1 && inlen == 0);
+
+ in->wcpos = 0;
}
- } else {
- in->pos.col = -1;
- if (c == '\n')
- in->pos.line++;
}
- /* FIXME: do input charmap translation. We should be returning
- * Unicode here. */
- return c;
+
+ return in->wc[in->wcpos++];
+
} else
return EOF;
}
@@ -884,6 +921,10 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) {
already = TRUE;/* inhibit get_token at top of loop */
prev_para_type = par.type;
addpara(par, ret);
+
+ if (par.type == para_Config) {
+ input_configure(in, &par);
+ }
continue; /* next paragraph */
}
}
@@ -1421,6 +1462,8 @@ paragraph *read_input(input *in, indexdata *idx) {
in->currfp = fopen(in->filenames[in->currindex], "r");
if (in->currfp) {
setpos(in, in->filenames[in->currindex]);
+ in->charset = in->defcharset;
+ in->csstate = charset_init_state;
read_file(&hptr, in, idx);
}
in->currindex++;