diff options
| author | Simon Tatham <anakin@pobox.com> | 1999-07-31 18:44:53 +0000 |
|---|---|---|
| committer | Simon Tatham <anakin@pobox.com> | 1999-07-31 18:44:53 +0000 |
| commit | 0d14833a9c76c51cc7417d8fd60bec9d92714b8e (patch) | |
| tree | c0716d398e83bc746baad088d5dfc215d5fea483 | |
| parent | 4c8c2b256ed01563a98b4cd820dc8ffef30d7fc1 (diff) | |
| download | halibut-0d14833a9c76c51cc7417d8fd60bec9d92714b8e.zip halibut-0d14833a9c76c51cc7417d8fd60bec9d92714b8e.tar.gz halibut-0d14833a9c76c51cc7417d8fd60bec9d92714b8e.tar.bz2 halibut-0d14833a9c76c51cc7417d8fd60bec9d92714b8e.tar.xz | |
Further development work. Parser nearly finished
[originally from svn r187]
| -rw-r--r-- | Makefile | 11 | ||||
| -rw-r--r-- | buttress.h | 79 | ||||
| -rw-r--r-- | error.c | 77 | ||||
| -rw-r--r-- | input.c | 682 | ||||
| -rw-r--r-- | inputs/test.but | 8 | ||||
| -rw-r--r-- | main.c | 30 | ||||
| -rw-r--r-- | malloc.c | 26 | ||||
| -rw-r--r-- | misc.c | 42 | ||||
| -rw-r--r-- | misc/buttress.sl | 4 | ||||
| -rw-r--r-- | ustring.c | 47 |
10 files changed, 965 insertions, 41 deletions
@@ -11,10 +11,15 @@ endif all: @test -d $(BUILDDIR) || mkdir $(BUILDDIR) @make -C $(BUILDDIR) -f ../Makefile REALBUILD=yes +clean: + @test -d $(BUILDDIR) || mkdir $(BUILDDIR) + @make -C $(BUILDDIR) -f ../Makefile clean REALBUILD=yes else # The `real' makefile part. +CFLAGS += -Wall -W + ifdef RELEASE ifndef VERSION VERSION := $(RELEASE) @@ -34,7 +39,8 @@ endif SRC := ../ -MODULES := main malloc error help licence version +MODULES := main malloc ustring error help licence version misc +MODULES += input OBJECTS := $(addsuffix .o,$(MODULES)) DEPS := $(addsuffix .d,$(MODULES)) @@ -48,6 +54,9 @@ buttress: $(OBJECTS) version.o: FORCE $(CC) $(VDEF) -MD -c $(SRC)version.c +clean:: + rm -f *.o buttress core + FORCE: # phony target to force version.o to be rebuilt every time -include $(DEPS) @@ -1,6 +1,9 @@ #ifndef BUTTRESS_BUTTRESS_H #define BUTTRESS_BUTTRESS_H +#include <stdio.h> +#include <wchar.h> + #ifdef __GNUC__ #define NORETURN __attribute__((__noreturn__)) #endif @@ -16,10 +19,20 @@ * Structure tags */ typedef struct input_Tag input; +typedef struct filepos_Tag filepos; typedef struct paragraph_Tag paragraph; typedef struct word_Tag word; /* + * Data structure to hold a file name and index, a line and a + * column number, for reporting errors + */ +struct filepos_Tag { + char *filename; + int line, col; +}; + +/* * Data structure to hold all the file names etc for input */ #define INPUT_PUSHBACK_MAX 16 @@ -28,8 +41,9 @@ struct input_Tag { int nfiles; /* how many in the list */ FILE *currfp; /* the currently open one */ int currindex; /* which one is that in the list */ - char pushback[INPUT_PUSHBACK_MAX]; /* pushed-back input characters */ + int pushback[INPUT_PUSHBACK_MAX]; /* pushed-back input characters */ int npushback; + filepos pos; }; /* @@ -39,19 +53,27 @@ struct input_Tag { struct paragraph_Tag { paragraph *next; int type; - char *keyword; /* for IR, IA, and heading paras */ + wchar_t *keyword; /* for most special paragraphs */ word *words; /* list of words in paragraph */ -} paragraph; +}; enum { - para_IA, /* index alias */ - para_IR, /* index rewrite */ + para_IM, /* index merge */ + para_BR, /* bibliography rewrite */ para_Chapter, para_Appendix, + para_UnnumberedChapter, para_Heading, para_Subsect, para_Normal, + para_Biblio, para_Bullet, - para_Code + para_NumberedList, + para_Code, + para_Copyright, + para_Preamble, + para_NoCite, + para_Title, + para_VersionID }; /* @@ -60,14 +82,18 @@ enum { struct word_Tag { word *next; int type; - char *text; -} + wchar_t *text; +}; enum { word_Normal, word_Emph, - word_Code, - word_IndexRef /* always invisible */ -} + word_Code, /* monospaced; `quoted' in text */ + word_WeakCode, /* monospaced, normal in text */ + word_UpperXref, /* \K */ + word_LowerXref, /* \k */ + word_IndexRef, /* (always an invisible one) */ + word_WhiteSpace /* text is NULL or ignorable */ +}; /* * error.c @@ -79,6 +105,20 @@ enum { err_optnoarg, /* option `-%s' requires an argument */ err_nosuchopt, /* unrecognised option `-%s' */ err_noinput, /* no input files */ + err_cantopen, /* unable to open input file `%s' */ + err_nodata, /* no data in input files */ + err_brokencodepara, /* line in codepara didn't begin `\c' */ + err_kwunclosed, /* expected `}' after keyword */ + err_kwillegal, /* paragraph type expects no keyword */ + err_kwexpected, /* paragraph type expects a keyword */ + err_kwtoomany, /* paragraph type expects only 1 */ + err_bodyillegal, /* paragraph type expects only kws! */ + err_badmidcmd, /* invalid command in mid-para */ + err_unexbrace, /* unexpected brace */ + err_explbr, /* expected `{' after command */ + err_kwexprbr, /* expected `}' after cross-ref */ + err_missingrbrace, /* unclosed braces at end of para */ + err_nestedstyles /* unable to nest text styles */ }; /* @@ -89,6 +129,14 @@ void *srealloc(void *p, int size); void sfree(void *p); /* + * ustring.c + */ +wchar_t *ustrdup(wchar_t *s); +char *ustrtoa(wchar_t *s, char *outbuf, int size); +int ustrlen(wchar_t *s); +wchar_t *ustrcpy(wchar_t *dest, wchar_t *source); + +/* * help.c */ void help(void); @@ -106,6 +154,15 @@ void licence(void); const char *const version; /* + * misc.c + */ +typedef struct stackTag *stack; +stack stk_new(void); +void stk_free(stack); +void stk_push(stack, void *); +void *stk_pop(stack); + +/* * input.c */ paragraph *read_input(input *in); @@ -11,10 +11,14 @@ * Error flags */ #define PREFIX 0x0001 /* give `buttress:' prefix */ +#define FILEPOS 0x0002 /* give file position prefix */ static void do_error(int code, va_list ap) { char error[1024]; + char auxbuf[256]; char *sp; + wchar_t *wsp; + filepos fpos; int flags; switch(code) { @@ -36,10 +40,83 @@ static void do_error(int code, va_list ap) { sprintf(error, "no input files"); flags = PREFIX; break; + case err_cantopen: + sp = va_arg(ap, char *); + sprintf(error, "unable to open input file `%.200s'", sp); + flags = PREFIX; + break; + case err_nodata: /* no arguments */ + sprintf(error, "no data in input files"); + flags = PREFIX; + break; + case err_brokencodepara: + fpos = *va_arg(ap, filepos *); + sprintf(error, "every line of a code paragraph should begin `\\c'"); + flags = FILEPOS; + break; + case err_kwunclosed: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected `}' after paragraph keyword"); + flags = FILEPOS; + break; + case err_kwexpected: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected a paragraph keyword"); + flags = FILEPOS; + break; + case err_kwillegal: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected no paragraph keyword"); + flags = FILEPOS; + break; + case err_kwtoomany: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected only one paragraph keyword"); + flags = FILEPOS; + break; + case err_bodyillegal: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected no text after paragraph keyword"); + flags = FILEPOS; + break; + case err_badmidcmd: + wsp = va_arg(ap, wchar_t *); + sp = ustrtoa(wsp, auxbuf, sizeof(auxbuf)); + fpos = *va_arg(ap, filepos *); + sprintf(error, "command `%.200s' unexpected in mid-paragraph", sp); + flags = FILEPOS; + break; + case err_unexbrace: + fpos = *va_arg(ap, filepos *); + sprintf(error, "brace character unexpected in mid-paragraph"); + flags = FILEPOS; + break; + case err_explbr: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected `{' after command"); + flags = FILEPOS; + break; + case err_kwexprbr: + fpos = *va_arg(ap, filepos *); + sprintf(error, "expected `}' after cross-reference"); + flags = FILEPOS; + break; + case err_missingrbrace: + fpos = *va_arg(ap, filepos *); + sprintf(error, "unclosed braces at end of paragraph"); + flags = FILEPOS; + break; + case err_nestedstyles: + fpos = *va_arg(ap, filepos *); + sprintf(error, "unable to nest text styles"); + flags = FILEPOS; + break; } if (flags & PREFIX) fputs("buttress: ", stderr); + if (flags & FILEPOS) + fprintf(stderr, "%s:%d: ", fpos.filename, fpos.line); fputs(error, stderr); fputc('\n', stderr); } @@ -6,7 +6,9 @@ #include <assert.h> #include "buttress.h" -static void unget(input *in, char c) { +#define TAB_STOP 8 /* for column number tracking */ + +static void unget(input *in, int c) { assert(in->npushback < INPUT_PUSHBACK_MAX); in->pushback[in->npushback++] = c; } @@ -16,59 +18,697 @@ static void unget(input *in, char c) { */ static int get(input *in) { if (in->npushback) - return (unsigned char)in->pushback[--in->npushback]; + return in->pushback[--in->npushback]; else if (in->currfp) { int c = getc(in->currfp); if (c == EOF) { fclose(in->currfp); in->currfp = NULL; } + /* Track line numbers, for error reporting */ + switch (c) { + case '\t': + in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP; + break; + case '\n': + in->pos.col = 1; + in->pos.line++; + break; + default: + in->pos.col++; + break; + } + /* FIXME: do input charmap translation. We should be returning + * Unicode here. */ return c; } else return EOF; } /* + * Small routines to amalgamate a string from an input source. + */ +typedef struct tagRdstring rdstring; +struct tagRdstring { + int pos, size; + wchar_t *text; +}; +static void rdadd(rdstring *rs, wchar_t c) { + if (rs->pos >= rs->size-1) { + rs->size = rs->pos + 128; + rs->text = srealloc(rs->text, rs->size * sizeof(wchar_t)); + } + rs->text[rs->pos++] = c; + rs->text[rs->pos] = 0; +} +static void rdadds(rdstring *rs, wchar_t *p) { + int len = ustrlen(p); + if (rs->pos >= rs->size - len) { + rs->size = rs->pos + len + 128; + rs->text = srealloc(rs->text, rs->size * sizeof(wchar_t)); + } + ustrcpy(rs->text + rs->pos, p); + rs->pos += len; +} +static wchar_t *rdtrim(rdstring *rs) { + rs->text = srealloc(rs->text, (rs->pos + 1) * sizeof(wchar_t)); + return rs->text; +} + +/* * Lexical analysis of source files. */ -typedef struct tagToken token; -struct tagToken { +typedef struct token_Tag token; +struct token_Tag { int type; - char *text; + int cmd, aux; + wchar_t *text; + filepos pos; }; enum { - tok_eof, + tok_eof, /* end of file */ tok_eop, /* end of paragraph */ - tok_word, /* an ordinary word */ + tok_white, /* whitespace */ + tok_word, /* a word or word fragment */ tok_cmd, /* \command */ - tok_bracetext /* {text} */ + tok_lbrace, /* { */ + tok_rbrace /* } */ }; +/* Buttress command keywords. */ +enum { + c__invalid, /* invalid command */ + c__comment, /* comment command (\#) */ + c__escaped, /* escaped character */ + c_A, /* appendix heading */ + c_B, /* bibliography entry */ + c_BR, /* bibliography rewrite */ + c_C, /* chapter heading */ + c_H, /* heading */ + c_I, /* invisible index mark */ + c_IM, /* index merge/rewrite */ + c_K, /* capitalised cross-reference */ + c_S, /* aux field is 0, 1, 2, ... */ + c_U, /* unnumbered-chapter heading */ + c_W, /* Web hyperlink */ + c_b, /* bulletted list */ + c_c, /* code */ + c_copyright, /* copyright statement */ + c_cw, /* weak code */ + c_date, /* document processing date */ + c_e, /* emphasis */ + c_i, /* visible index mark */ + c_ii, /* uncapitalised visible index mark */ + c_k, /* uncapitalised cross-reference */ + c_n, /* numbered list */ + c_nocite, /* bibliography trickery */ + c_preamble, /* document preamble text */ + c_title, /* document title */ + c_u, /* aux field is char code */ + c_versionid /* document RCS id */ +}; + +/* Perhaps whitespace should be defined in a more Unicode-friendly way? */ +#define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 ) +#define isnl(c) ( (c)==10 ) +#define isdec(c) ( ((c)>='0'&&(c)<='9') ) +#define fromdec(c) ( (c)-'0' ) +#define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f')) +#define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) ) +#define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z')) + +/* + * Keyword comparison function. Like strcmp, but between a wchar_t * + * and a char *. + */ +static int kwcmp(wchar_t const *p, char const *q) { + int i; + do { + i = *p - *q; + } while (*p++ && *q++ && !i); + return i; +} + +/* + * Match a keyword. + */ +static void match_kw(token *tok) { + /* + * FIXME. The ids are explicit in here so as to allow long-name + * equivalents to the various very short keywords. + */ + static const struct { char const *name; int id; } keywords[] = { + {"#", c__comment}, /* comment command (\#) */ + {"A", c_A}, /* appendix heading */ + {"B", c_B}, /* bibliography entry */ + {"BR", c_BR}, /* bibliography rewrite */ + {"C", c_C}, /* chapter heading */ + {"H", c_H}, /* heading */ + {"I", c_I}, /* invisible index mark */ + {"IM", c_IM}, /* index merge/rewrite */ + {"K", c_K}, /* capitalised cross-reference */ + {"U", c_U}, /* unnumbered-chapter heading */ + {"W", c_W}, /* Web hyperlink */ + {"\\", c__escaped}, /* escaped backslash (\\) */ + {"b", c_b}, /* bulletted list */ + {"c", c_c}, /* code */ + {"copyright", c_copyright}, /* copyright statement */ + {"cw", c_cw}, /* weak code */ + {"date", c_date}, /* document processing date */ + {"e", c_e}, /* emphasis */ + {"i", c_i}, /* visible index mark */ + {"ii", c_ii}, /* uncapitalised visible index mark */ + {"k", c_k}, /* uncapitalised cross-reference */ + {"n", c_n}, /* numbered list */ + {"nocite", c_nocite}, /* bibliography trickery */ + {"preamble", c_preamble}, /* document preamble text */ + {"title", c_title}, /* document title */ + {"versionid", c_versionid}, /* document RCS id */ + {"{", c__escaped}, /* escaped lbrace (\{) */ + {"}", c__escaped}, /* escaped rbrace (\}) */ + }; + int i, j, k, c; + + /* + * Special cases: \S{0,1,2,...} and \uABCD. If the syntax + * doesn't match correctly, we just fall through to the + * binary-search phase. + */ + if (tok->text[0] == 'S') { + /* We expect numeric characters thereafter. */ + wchar_t *p = tok->text+1; + int n = 0; + while (*p && isdec(*p)) { + n = 10 * n + fromdec(*p); + p++; + } + if (!*p) { + tok->cmd = c_S; + tok->aux = n; + return; + } + } else if (tok->text[0] == 'u') { + /* We expect hex characters thereafter. */ + wchar_t *p = tok->text+1; + int n = 0; + while (*p && ishex(*p)) { + n = 16 * n + fromhex(*p); + p++; + } + if (!*p) { + tok->cmd = c_u; + tok->aux = n; + return; + } + } + + i = -1; + j = sizeof(keywords)/sizeof(*keywords); + while (j-i > 1) { + k = (i+j)/2; + c = kwcmp(tok->text, keywords[k].name); + if (c < 0) + j = k; + else if (c > 0) + i = k; + else /* c == 0 */ { + tok->cmd = keywords[k].id; + return; + } + } + + tok->cmd = c__invalid; +} + + +/* + * Read a token from the input file, in the normal way (`normal' in + * the sense that code paragraphs work a different way). + */ +token get_token(input *in) { + int c; + int nls; + token ret; + rdstring rs = { 0, 0, NULL }; + + ret.text = NULL; /* default */ + ret.pos = in->pos; + c = get(in); + if (iswhite(c)) { /* tok_white or tok_eop */ + nls = 0; + do { + if (isnl(c)) + nls++; + } while ((c = get(in)) != EOF && iswhite(c)); + unget(in, c); + ret.type = (nls > 1 ? tok_eop : tok_white); + return ret; + } else if (c == EOF) { /* tok_eof */ + ret.type = tok_eof; + return ret; + } else if (c == '\\') { /* tok_cmd */ + c = get(in); + if (c == '\\' || c == '#' || c == '{' || c == '}') { + /* single-char command */ + rdadd(&rs, c); + } else if (c == 'u') { + int len = 0; + do { + rdadd(&rs, c); + len++; + c = get(in); + } while (ishex(c) && len < 5); + unget(in, c); + } else if (iscmd(c)) { + do { + rdadd(&rs, c); + c = get(in); + } while (iscmd(c)); + unget(in, c); + } + /* + * Now match the command against the list of available + * ones. + */ + ret.type = tok_cmd; + ret.text = ustrdup(rs.text); + match_kw(&ret); + sfree(rs.text); + return ret; + } else if (c == '{') { /* tok_lbrace */ + ret.type = tok_lbrace; + return ret; + } else if (c == '}') { /* tok_rbrace */ + ret.type = tok_rbrace; + return ret; + } else { /* tok_word */ + /* + * Read a word: the longest possible contiguous sequence of + * things other than whitespace, backslash, braces and + * hyphen. A hyphen terminates the word but is returned as + * part of it; everything else is pushed back for the next + * token. + */ + while (1) { + if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) { + /* Put back the character that caused termination */ + unget(in, c); + break; + } else { + rdadd(&rs, c); + if (c == '-') + break; /* hyphen terminates word */ + } + c = get(in); + } + ret.type = tok_word; + ret.text = ustrdup(rs.text); + sfree(rs.text); + return ret; + } +} + +/* + * Determine whether the next input character is an open brace (for + * telling code paragraphs from paragraphs which merely start with + * code). + */ +int isbrace(input *in) { + int c; + + c = get(in); + unget(in, c); + return (c == '{'); +} + +/* + * Read the rest of a line that starts `\c'. Including nothing at + * all (tok_word with empty text). + */ +token get_codepar_token(input *in) { + int c; + token ret; + rdstring rs = { 0, 0, NULL }; + + ret.pos = in->pos; + ret.type = tok_word; + c = get(in); /* expect (and discard) one space */ + if (c == ' ') { + c = get(in); + ret.pos = in->pos; + } + while (!isnl(c) && c != EOF) { + rdadd(&rs, c); + c = get(in); + } + unget(in, c); + ret.text = ustrdup(rs.text); + sfree(rs.text); + return ret; +} + +/* + * Adds a new word to a linked list + */ +static void addword(word newword, word ***hptrptr) { + word *mnewword = smalloc(sizeof(word)); + *mnewword = newword; /* structure copy */ + mnewword->next = NULL; + **hptrptr = mnewword; + *hptrptr = &mnewword->next; +} + /* * Adds a new paragraph to a linked list */ -static paragraph addpara(paragraph ***hptrptr) { - paragraph *newpara = smalloc(sizeof(paragraph)); - newpara->next = NULL; - **hptrptr = newpara; - *hptrptr = &newpara->next; - return newpara; +static void addpara(paragraph newpara, paragraph ***hptrptr) { + paragraph *mnewpara = smalloc(sizeof(paragraph)); + *mnewpara = newpara; /* structure copy */ + mnewpara->next = NULL; + **hptrptr = mnewpara; + *hptrptr = &mnewpara->next; } /* * Reads a single file (ie until get() returns EOF) */ static void read_file(paragraph ***ret, input *in) { - int c; + token t; + paragraph par; + word wd, **whptr; + int style; + struct stack_item { + enum { + stack_ualt, /* \u alternative */ + stack_style, /* \e, \c, \cw */ + stack_idx, /* \I, \i, \ii */ + stack_nop /* do nothing (for error recovery) */ + } type; + word **whptr; /* to restore from \u alternatives */ + } *sitem; + stack parsestk; + /* + * Loop on each paragraph. + */ while (1) { + par.words = NULL; + par.keyword = NULL; + whptr = &par.words; + /* * Get a token. */ - token t = get_token(in); + t = get_token(in); if (t.type == tok_eof) return; - printf("token: %d\n", t.type); + + /* + * Parse code paragraphs separately. + */ + if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) { + par.type = para_Code; + while (1) { + t = get_codepar_token(in); + wd.type = word_WeakCode; + wd.text = ustrdup(t.text); + addword(wd, &whptr); + t = get_token(in); + if (t.type == tok_white) { + /* + * The newline after a code-paragraph line + */ + t = get_token(in); + } + if (t.type == tok_eop || t.type == tok_eof) + break; + else if (t.type != tok_cmd || t.cmd != c_c) { + error(err_brokencodepara, &t.pos); + addpara(par, ret); + while (t.type != tok_eop) /* error recovery: */ + t = get_token(in); /* eat rest of paragraph */ + continue; + } + } + addpara(par, ret); + } + + /* + * This token begins a paragraph. See if it's one of the + * special commands that define a paragraph type. + * + * (note that \# is special in a way, and \nocite takes no + * text) + */ + par.type = para_Normal; + if (t.type == tok_cmd) { + int needkw; + + switch (t.cmd) { + default: + needkw = -1; + break; + case c__comment: + do { + t = get_token(in); + } while (t.type != tok_eop && t.type != tok_eof); + continue; /* next paragraph */ + /* + * `needkw' values: + * + * 0 -- no keywords at all + * 1 -- exactly one keyword + * 2 -- at least one keyword + * 4 -- any number of keywords including zero + * 8 -- at least one keyword and then nothing else + */ + case c_A: needkw = 2; par.type = para_Appendix; break; + case c_B: needkw = 2; par.type = para_Biblio; break; + case c_BR: needkw = 1; par.type = para_BR; break; + case c_C: needkw = 2; par.type = para_Chapter; break; + case c_H: needkw = 2; par.type = para_Heading; break; + case c_IM: needkw = 2; par.type = para_IM; break; + /* FIXME: multiple levels of Subsect */ + case c_S: needkw = 2; par.type = para_Subsect; break; + case c_U: needkw = 0; par.type = para_UnnumberedChapter; break; + /* For \b and \n the keyword is optional */ + case c_b: needkw = 4; par.type = para_Bullet; break; + case c_n: needkw = 4; par.type = para_NumberedList; break; + case c_copyright: needkw = 0; par.type = para_Copyright; break; + /* For \nocite the keyword is _everything_ */ + case c_nocite: needkw = 8; par.type = para_NoCite; break; + case c_preamble: needkw = 0; par.type = para_Preamble; break; + case c_title: needkw = 0; par.type = para_Title; break; + case c_versionid: needkw = 0; par.type = para_VersionID; break; + } + + if (needkw >= 0) { + rdstring rs = { 0, 0, NULL }; + int nkeys = 0; + filepos fp; + + /* Get keywords. */ + t = get_token(in); + fp = t.pos; + while (t.type == tok_lbrace) { + /* This is a keyword. */ + nkeys++; + /* FIXME: there will be bugs if anyone specifies an + * empty keyword (\foo{}), so trap this case. */ + while (t = get_token(in), + t.type == tok_word || t.type == tok_white) { + if (t.type == tok_white) + rdadd(&rs, ' '); + else + rdadds(&rs, t.text); + } + if (t.type != tok_rbrace) { + error(err_kwunclosed, &t.pos); + /* FIXME: memory leak */ + continue; + } + rdadd(&rs, 0); /* add string terminator */ + t = get_token(in); /* eat right brace */ + } + + rdadd(&rs, 0); /* add string terminator */ + + /* See whether we have the right number of keywords. */ + if (needkw == 0 && nkeys > 0) + error(err_kwillegal, &fp); + if ((needkw & 11) && nkeys == 0) + error(err_kwexpected, &fp); + if ((needkw & 5) && nkeys > 1) + error(err_kwtoomany, &fp); + + par.keyword = rdtrim(&rs); + + /* Move to EOP in case of needkw==8 (no body) */ + if (needkw == 8) { + if (t.type != tok_eop) { + error(err_bodyillegal, &t.pos); + while (t.type != tok_eop) /* error recovery: */ + t = get_token(in); /* eat rest of paragraph */ + } + addpara(par, ret); + continue; /* next paragraph */ + } + } + } + + /* + * Now read the actual paragraph, word by word, adding to + * the paragraph list. + * + * Mid-paragraph commands: + * + * \K \k + * \c \cw + * \e + * \i \ii + * \I + * \u + * \W + * \\ \{ \} + */ + parsestk = stk_new(); + style = word_Normal; + while (t.type != tok_eop && t.type != tok_eof) { + if (t.type == tok_cmd && t.cmd == c__escaped) + t.type = tok_word; /* nice and simple */ + switch (t.type) { + case tok_white: + wd.text = NULL; + wd.type = word_WhiteSpace; + addword(wd, &whptr); + break; + case tok_word: + wd.text = ustrdup(t.text); + wd.type = style; + addword(wd, &whptr); + break; + case tok_lbrace: + error(err_unexbrace, &t.pos); + /* FIXME: errorrec. Push nop. */ + break; + case tok_rbrace: + sitem = stk_pop(parsestk); + if (!sitem) + error(err_unexbrace, &t.pos); + else switch (sitem->type) { + case stack_ualt: + whptr = sitem->whptr; + break; + case stack_style: + style = word_Normal; + break; + case stack_idx: + /* FIXME: do this bit! */ + case stack_nop: + break; + } + sfree(sitem); + break; + case tok_cmd: + switch (t.cmd) { + case c_K: + case c_k: + /* + * Keyword. We expect a left brace, some text, + * and then a right brace. No nesting; no + * arguments. + */ + if (t.cmd == c_K) + wd.type = word_UpperXref; + else + wd.type = word_LowerXref; + t = get_token(in); + if (t.type != tok_lbrace) { + error(err_explbr, &t.pos); + } + { + rdstring rs = { 0, 0, NULL }; + while (t = get_token(in), + t.type == tok_word || t.type == tok_white) { + if (t.type == tok_white) + rdadd(&rs, ' '); + else + rdadds(&rs, t.text); + } + wd.text = ustrdup(rs.text); + } + if (t.type != tok_rbrace) { + error(err_kwexprbr, &t.pos); + } + addword(wd, &whptr); + break; + case c_c: + case c_cw: + case c_e: + if (style != word_Normal) { + error(err_nestedstyles, &t.pos); + /* Error recovery: eat lbrace, push nop. */ + t = get_token(in); + sitem = smalloc(sizeof(*sitem)); + sitem->type = stack_nop; + stk_push(parsestk, sitem); + } + t = get_token(in); + if (t.type != tok_lbrace) { + error(err_explbr, &t.pos); + } else { + style = (t.cmd == c_c ? word_Code : + t.cmd == c_cw ? word_WeakCode : + word_Emph); + sitem = smalloc(sizeof(*sitem)); + sitem->type = stack_style; + stk_push(parsestk, sitem); + } + break; + case c_i: + case c_ii: + case c_I: + if (style != word_Normal) { + error(err_nestedstyles, &t.pos); + /* Error recovery: eat lbrace, push nop. */ + t = get_token(in); + sitem = smalloc(sizeof(*sitem)); + sitem->type = stack_nop; + stk_push(parsestk, sitem); + } + t = get_token(in); + if (t.type != tok_lbrace) { + error(err_explbr, &t.pos); + } else { + /* + * FIXME: do something useful + * Add an index-ref word and keep a pointer to it + * Set a flag so that other addwords also update it + */ + sitem = smalloc(sizeof(*sitem)); + sitem->type = stack_idx; + stk_push(parsestk, sitem); + } + break; + case c_u: + case c_W: + default: + error(err_badmidcmd, t.text, &t.pos); + break; + } + } + t = get_token(in); + } + /* Check the stack is empty */ + if (NULL != (sitem = stk_pop(parsestk))) { + do { + sfree(sitem); + sitem = stk_pop(parsestk); + } while (sitem); + error(err_missingrbrace, &t.pos); + } + stk_free(parsestk); + addpara(par, ret); } } @@ -78,8 +718,14 @@ paragraph *read_input(input *in) { while (in->currindex < in->nfiles) { in->currfp = fopen(in->filenames[in->currindex], "r"); - if (in->currfp) + if (in->currfp) { + in->pos.filename = in->filenames[in->currindex]; + in->pos.line = 1; + in->pos.col = 1; read_file(&hptr, in); + } in->currindex++; } + + return head; } diff --git a/inputs/test.but b/inputs/test.but index ca159e9..fbc1441 100644 --- a/inputs/test.but +++ b/inputs/test.but @@ -6,7 +6,7 @@ feature that Buttress's input format supports. Creation date \copyright Copyright 1999 Simon Tatham. All rights reserved. -\versionid $Id: test.but,v 1.2 1999/02/07 13:17:47 simon Exp $ +\versionid $Id: test.but,v 1.3 1999/07/31 18:44:53 simon Exp $ \C{chap} First chapter title @@ -63,7 +63,7 @@ An index tag containing non-alternatived Unicode: \i{\u00BFChe?} An invisible index tag: \I{she seems to have an invisible tag}yeah. -\S0{subsub} Smaller heading still +\S1{subsub} Smaller heading still A tiny section. Awww. How cute. @@ -74,7 +74,7 @@ Here's an \i{appendix}, for no terribly good reason at all. See It also contains a \W{http://www.gallery.uk.eu.org/}{hyperlink}. -\U{bib} Bibliography +\U Bibliography \B{book} Some text describing a book. @@ -83,7 +83,7 @@ the document even though there is no \cw{\\k} citing it. \BR{book} [SillyCitation] -\nocite{nocite} +\nocite{bookwithoutcitation}{foo} \B{uncited} If this text appears, there's an actual error. @@ -112,7 +112,7 @@ int main(int argc, char **argv) { */ switch (c) { case 'o': - ofile = p; + outfile = p; break; } p = NULL; /* prevent continued processing */ @@ -153,7 +153,7 @@ int main(int argc, char **argv) { { input in; - paragraph sourceform; + paragraph *sourceform; in.filenames = infiles; in.nfiles = nfiles; @@ -174,9 +174,31 @@ int main(int argc, char **argv) { paragraph *p; word *w; for (p = sourceform; p; p = p->next) { - printf("para %d \"%s\" {\n", p->type, p->keyword); + wchar_t *wp; + printf("para %d ", p->type); + if (p->keyword) { + wp = p->keyword; + while (*wp) { + putchar('\"'); + for (; *wp; wp++) + putchar(*wp); + putchar('\"'); + if (*++wp) + printf(", "); + } + } else + printf("(no keyword)"); + printf(" {\n"); for (w = p->words; w; w = w->next) { - printf(" word %d \"%s\"\n", w->type, w->text); + printf(" word %d ", w->type); + if (w->text) { + printf("\""); + for (wp = w->text; *wp; wp++) + putchar(*wp); + printf("\""); + } else + printf("(no text)"); + printf("\n"); } printf("}\n"); } @@ -35,5 +35,29 @@ void *srealloc(void *p, int size) { q = malloc(size); if (!q) fatal(err_nomemory); - return p; + return q; +} + +/* + * Free a linked list of words + */ +void free_word_list(word *w) { + word *t; + while (w) { + t = w; + w = w->next; + sfree(t); + } +} + +/* + * Free a linked list of paragraphs + */ +void free_para_list(paragraph *p) { + paragraph *t; + while (p) { + t = p; + p = p->next; + sfree(t); + } } @@ -0,0 +1,42 @@ +/* + * misc.c: miscellaneous useful items + */ + +#include "buttress.h" + +struct stackTag { + void **data; + int sp; + int size; +}; + +stack stk_new(void) { + stack s; + + s = smalloc(sizeof(*s)); + s->sp = 0; + s->size = 0; + s->data = NULL; + + return s; +} + +void stk_free(stack s) { + sfree(s->data); + sfree(s); +} + +void stk_push(stack s, void *item) { + if (s->size <= s->sp) { + s->size = s->sp + 32; + s->data = srealloc(s->data, s->size * sizeof(*s->data)); + } + s->data[s->sp++] = item; +} + +void *stk_pop(stack s) { + if (s->sp > 0) + return s->data[--s->sp]; + else + return NULL; +} diff --git a/misc/buttress.sl b/misc/buttress.sl index 13efc03..0d7e207 100644 --- a/misc/buttress.sl +++ b/misc/buttress.sl @@ -1,4 +1,4 @@ -% The functions here are common to both TeX and LaTeX modes. +% Buttress mode for Jed. $1 = "Buttress"; create_syntax_table ($1); @@ -24,7 +24,7 @@ define_highlight_rule (".", "normal", $1); build_highlight_table ($1); #endif -% This hook identifies lines containing TeX comments as paragraph separator +% This hook identifies lines containing comments as paragraph separator define buttress_is_comment() { bol (); while (ffind ("\\\\#")) go_right (3); diff --git a/ustring.c b/ustring.c new file mode 100644 index 0000000..00e26b6 --- /dev/null +++ b/ustring.c @@ -0,0 +1,47 @@ +/* + * ustring.c: Unicode string routines + */ + +#include <wchar.h> +#include "buttress.h" + +wchar_t *ustrdup(wchar_t *s) { + wchar_t *r; + if (s) { + r = smalloc((1+ustrlen(s)) * sizeof(wchar_t)); + ustrcpy(r, s); + } else { + r = smalloc(1); + *r = 0; + } + return r; +} + +char *ustrtoa(wchar_t *s, char *outbuf, int size) { + char *p; + if (!s) { + *outbuf = '\0'; + return outbuf; + } + for (p = outbuf; *s && p < outbuf+size; p++,s++) + *p = *s; + if (p < outbuf+size) + *p = '\0'; + else + outbuf[size-1] = '\0'; + return outbuf; +} + +int ustrlen(wchar_t *s) { + int len = 0; + while (*s++) len++; + return len; +} + +wchar_t *ustrcpy(wchar_t *dest, wchar_t *source) { + wchar_t *ret = dest; + do { + *dest++ = *source; + } while (*source++); + return ret; +} |