Support for \cfg{input-charset}. Input files can now be in ASCII,
8859-*, UTF-8, or a variety of more fun encodings including various multibyte ones. [originally from svn r4095]
author: Simon Tatham <anakin@pobox.com> 2004-04-19 17:09:49 +0000
committer: Simon Tatham <anakin@pobox.com> 2004-04-19 17:09:49 +0000
commit: 8a9d3f97956db97b0813a6d24c486371ff14bd80 (patch)
tree: a080b34a57cee86398a0f6a181fe80a0a869eae9 /input.c
parent: f42941536c6c16ba8b89dd5f25d8a747e3d5495d (diff)
download: halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.zip
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.gz
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.bz2
halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.xz
1 files changed, 70 insertions, 27 deletions
diff --git a/input.c b/input.c
index efce410..d607e86 100644
--- a/input.c
+++ b/input.c
@@ -82,6 +82,16 @@ static void macrocleanup(tree234 *macros) {
     freetree234(macros);
 }
 
+static void input_configure(input *in, paragraph *cfg) {
+    assert(cfg->type == para_Config);
+
+    if (!ustricmp(cfg->keyword, L"input-charset")) {
+	char *csname = utoa_dup(uadv(cfg->keyword));
+	in->charset = charset_from_localenc(csname);
+	sfree(csname);
+    }
+}
+
 /*
  * Can return EOF
  */
@@ -103,36 +113,63 @@ static int get(input *in, filepos *pos) {
 	return c;
     }
     else if (in->currfp) {
-	int c = getc(in->currfp);
 
-	if (c == EOF) {
-	    fclose(in->currfp);
-	    in->currfp = NULL;
-	}
-	/* Track line numbers, for error reporting */
-	if (pos)
-	    *pos = in->pos;
-	if (in->reportcols) {
-	    switch (c) {
-	      case '\t':
-		in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
-		break;
-	      case '\n':
-		in->pos.col = 1;
-		in->pos.line++;
-		break;
-	      default:
-		in->pos.col++;
-		break;
+	while (in->wcpos >= in->nwc) {
+
+	    int c = getc(in->currfp);
+
+	    if (c == EOF) {
+		fclose(in->currfp);
+		in->currfp = NULL;
+		return EOF;
+	    }
+	    /* Track line numbers, for error reporting */
+	    if (pos)
+		*pos = in->pos;
+	    if (in->reportcols) {
+		switch (c) {
+		  case '\t':
+		    in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
+		    break;
+		  case '\n':
+		    in->pos.col = 1;
+		    in->pos.line++;
+		    break;
+		  default:
+		    in->pos.col++;
+		    break;
+		}
+	    } else {
+		in->pos.col = -1;
+		if (c == '\n')
+		    in->pos.line++;
+	    }
+
+	    /*
+	     * Do input character set translation, so that we return
+	     * Unicode.
+	     */
+	    {
+		char buf[1];
+		char const *p;
+		int inlen;
+
+		buf[0] = (char)c;
+		p = buf;
+		inlen = 1;
+
+		in->nwc = charset_to_unicode(&p, &inlen,
+					     in->wc, lenof(in->wc),
+					     in->charset, &in->csstate,
+					     NULL, 0);
+		assert(p == buf+1 && inlen == 0);
+
+		in->wcpos = 0;
 	    }
-	} else {
-	    in->pos.col = -1;
-	    if (c == '\n')
-		in->pos.line++;
 	}
-	/* FIXME: do input charmap translation. We should be returning
-	 * Unicode here. */
-	return c;
+
+	return in->wc[in->wcpos++];
+
     } else
 	return EOF;
 }
@@ -884,6 +921,10 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) {
 			already = TRUE;/* inhibit get_token at top of loop */
 		    prev_para_type = par.type;
 		    addpara(par, ret);
+
+		    if (par.type == para_Config) {
+			input_configure(in, &par);
+		    }
 		    continue;	       /* next paragraph */
 		}
 	    }
@@ -1421,6 +1462,8 @@ paragraph *read_input(input *in, indexdata *idx) {
 	in->currfp = fopen(in->filenames[in->currindex], "r");
 	if (in->currfp) {
 	    setpos(in, in->filenames[in->currindex]);
+	    in->charset = in->defcharset;
+	    in->csstate = charset_init_state;
 	    read_file(&hptr, in, idx);
 	}
 	in->currindex++;
author	Simon Tatham <anakin@pobox.com>	2004-04-19 17:09:49 +0000
committer	Simon Tatham <anakin@pobox.com>	2004-04-19 17:09:49 +0000
commit	8a9d3f97956db97b0813a6d24c486371ff14bd80 (patch)
tree	a080b34a57cee86398a0f6a181fe80a0a869eae9 /input.c
parent	f42941536c6c16ba8b89dd5f25d8a747e3d5495d (diff)
download	halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.zip halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.gz halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.bz2 halibut-8a9d3f97956db97b0813a6d24c486371ff14bd80.tar.xz