/*************************************************************************** * __________ __ ___. * Open \______ \ ____ ____ | | _\_ |__ _______ ___ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ * \/ \/ \/ \/ \/ * $Id$ * * Copyright (C) 2007 Matthias Larisch * Copyright (C) 2007 Adam Gashlin (hcs) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY * KIND, either express or implied. * ****************************************************************************/ /* Usage: xmlconv wiki_xml output_prefix generates output_prefix.wwr, output_prefix.wwt, output_prefixN.wwa where N is 0-? (each file ~ 1 GB) This is a rewrite from original ruby converter (2007 by Frederik Vestre) */ #include #include #include #include #include #include #include "xmlconv.h" #include "xmlentities.h" static FILE *titlefile; static FILE *redirectfile; static FILE *outfile; static int outfilenum; static char outfilename[PATH_MAX]; static pcre *regex_redirect; static int ovector_temp[30]; static int matchcount_temp; static article_header buf_article_header; static title_entry buf_title_entry; static void strreverse(char* begin, char* end) { char aux; while(end>begin) { aux=*end; *end--=*begin; *begin++=aux; } } static void itoa(int value, char* str, int base) { static char num[] = "0123456789abcdefghijklmnopqrstuvwxyz"; char* wstr=str; int sign; /* Validate base */ if (base<2 || base>35) { *wstr='\0'; return; } /* Take care of sign */ if ((sign=value) < 0) value = -value; /* Conversion. Number is reversed. */ do { *wstr++ = num[value%base]; } while(value/=base); if(sign<0) *wstr++='-'; *wstr='\0'; /* Reverse string */ strreverse(str,wstr-1); } static int open_outfile(int mode) { char filename[PATH_MAX]; strcpy(filename,outfilename); if(mode==1) outfilenum++; itoa(outfilenum,filename+strlen(filename),10); strcat(filename,".wwa"); if(mode==1) { if((outfile=fopen(filename,"w"))==NULL) { printf("Error on creating article output file\n"); return 0; } } else if((outfile=fopen(filename,"a"))==NULL) { printf("Error on reopening article output file\n"); return 0; } return 1; } static void insert_redirect(char redirect_from[255], char redirect_to[255]) { unsigned int buffer; buffer=strlen(redirect_from); fwrite(&buffer, sizeof(unsigned int), 1, redirectfile); buffer=strlen(redirect_to); fwrite(&buffer, sizeof(unsigned int), 1, redirectfile); fputs(redirect_from,redirectfile); fputs(redirect_to,redirectfile); } /* write a unicode value to a character buffer as UTF-8 increments *offset by numer of bytes written to buf by hcs, based on the wikipedia article on UTF-8 */ static void writeUTF8(char * buf, int * offset, int value) { if (value>=0 && value<=0x7f) buf[(*offset)++]=value; else if (value>=0x80 && value <=0x7ff) { buf[(*offset)++]=((value >> 6)&0x1f)|0xC0; buf[(*offset)++]=(value&0x3f)|0x80; } else if (value>=0x800 && value <=0xffff) { buf[(*offset)++]=((value >> 12)&0xf)|0xe0; buf[(*offset)++]=((value >> 6)&0x3f)|0x80; buf[(*offset)++]=(value&0x3f)|0x80; } else if (value>=0x10000 && value <= 0x10ffff) { buf[(*offset)++]=((value >> 18)&0xf)|0xf0; buf[(*offset)++]=((value >> 12)&0x3f)|0x80; buf[(*offset)++]=((value >> 6)&0x3f)|0x80; buf[(*offset)++]=(value&0x3f)|0x80; } else perror("writeUTF8"); } /* wiki code and HTML parsing (by hcs) */ static int inbold; static int initalic; /* int parse_(char * outbuf, const char * inbuf, int * const inlen); outbuf: output buffer inbuf: input buffer inlength: total length of input buffer (few parsers will consume the entire input), decremented by number of bytes consumed */ #define CONSUME(amt) { const int tempc=(amt); inbuf+=tempc; inlen-=tempc; } #define PRODUCE(c) { (*(outbuf++))=(c); } #define PRODUCEUTF8(c) { int temp=0; writeUTF8(outbuf, &temp, (c)); outbuf+=temp; } /* between PARSEHEAD and PARSETAIL goes a block for the body of the while loop */ #define PARSEHEAD(name) static void doparse_ ## name(char ** outbufptr, const char ** inbufptr, int * const inlenptr) { \ const char * inbuf = *inbufptr; \ char * outbuf = *outbufptr; \ int inlen = *inlenptr; \ while (inlen>0) #define PARSETAIL \ *inbufptr = inbuf; \ *outbufptr = outbuf; \ *inlenptr = inlen; \ } #define DESCEND(name) doparse_ ## name(&outbuf,&inbuf,&inlen) #define PASS { PRODUCE(inbuf[0]); CONSUME(1); } #define PARSEPROTO(name) static void doparse_ ## name(char ** outbufptr, const char ** inbufptr, int * const inlenptr) #define PARSEPROTOINLINE(name) static inline void doparse_ ## name(char ** outbufptr, const char ** inbufptr, int * const inlenptr) /* there has got to be a better way */ PARSEPROTO(comment); PARSEPROTO(linkname); PARSEPROTO(link); PARSEPROTO(ref); PARSEPROTO(article); PARSEPROTOINLINE(text); /* assume we're just past the comment opening tag */ PARSEHEAD(comment) { if (inbuf[0]=='-' && !memcmp(inbuf+1,"->",5)) { CONSUME(6); break; } else CONSUME(1); } PARSETAIL PARSEHEAD(linkname) { if (inbuf[0]=='|') { CONSUME(1); /* take back anything we've written so far, only the last is display text */ outbuf=*outbufptr; } else if (inbuf[0]==']' && inbuf[1]==']') break; else if (inbuf[0]=='[' && inbuf[1]=='[') { /* for instance, links in image caption */ PRODUCE(MARKUP_STARTLINK); CONSUME(2); DESCEND(link); } else DESCEND(text); } PARSETAIL PARSEHEAD(link) { if (inbuf[0]==']' && inbuf[1]==']') { CONSUME(2); PRODUCE(MARKUP_ENDLINK); break; } else if (inbuf[0]=='|') { CONSUME(1); PRODUCE(MARKUP_MODE); DESCEND(linkname); /* should come back with ]] */ CONSUME(2); PRODUCE(MARKUP_ENDLINK); break; } else DESCEND(text); } PARSETAIL PARSEHEAD(ref) { if (inbuf[0]=='&' && !memcmp(inbuf+1,"lt;/",4) && tolower(inbuf[5])=='r' && tolower(inbuf[6])=='e' && tolower(inbuf[7])=='f' && !memcmp(inbuf+8,">",4)) { CONSUME(12); break; } else CONSUME(1); } PARSETAIL PARSEHEAD(article) { /* links */ switch (inbuf[0]) { case '[': if (!memcmp(inbuf,"[[",2)) { /* better-than-nothing image processing */ if (!memcmp(inbuf+3,"mage:",5) && (inbuf[2]=='i' || inbuf[2]=='I')) { CONSUME(2); PRODUCE(MARKUP_STARTLINK); while (inbuf[0]!='|' && memcmp(inbuf,"]]",2) ) { PASS; } PRODUCE(MARKUP_MODE); PRODUCE('['); PRODUCE('I'); PRODUCE(']'); PRODUCE(MARKUP_ENDLINK); PRODUCE(' '); if (!memcmp(inbuf,"]]",2)) { CONSUME(2); } else { CONSUME(1); DESCEND(linkname); /* should come back with ]] */ CONSUME(2); PRODUCE('\n'); } } else { PRODUCE(MARKUP_STARTLINK); CONSUME(2); DESCEND(link); } break; } default: DESCEND(text); } } PARSETAIL static int entity_comparer(const void *_key, const void *_ent) { const char* key = (const char*) _key; struct html_entity* ent = (struct html_entity*) _ent; return strncmp(key, ent->string, ent->string_length); } /* stuff that can be anywhere in plain text */ /* only processes current character (unless it is something we recognize) */ static inline void doparse_text(char ** outbufptr, const char ** inbufptr, int * const inlenptr) { const char * inbuf = *inbufptr; char * outbuf = *outbufptr; int inlen = *inlenptr; switch (inbuf[0]) { case '\0': printf("Read zero but end of string not reached\n"); CONSUME(1); break; case '\'': if (!memcmp(inbuf+1,"''",2)) { CONSUME(3); if (inbold) { PRODUCE(MARKUP_ENDBOLD); inbold=0; } else { PRODUCE(MARKUP_STARTBOLD); inbold=1; } break; } if (inbuf[1]=='\'') { CONSUME(2); if (initalic) { PRODUCE(MARKUP_ENDITALIC); initalic=0; } else { PRODUCE(MARKUP_STARTITALIC); initalic=1; } break; } PASS; break; case '&': /* comment */ if (!memcmp(inbuf+1,"lt;!--",6)) { CONSUME(7); DESCEND(comment); break; } /* references (really clutter things up) */ if (!memcmp(inbuf+1,"lt;",3) && tolower(inbuf[4])=='r' && tolower(inbuf[5])=='e' && tolower(inbuf[6])=='f' && (inbuf[7]=='/' || inbuf[7]==' ' || inbuf[7]=='&')) { int i; CONSUME(7); /* handle (stanislaw lem) */ /* find end angle bracket */ for (i=0;; i++) { if (inbuf[i]=='&' && !memcmp(inbuf+i+1,"gt;",3)) break; } /* check for self-terminating tag */ if (inbuf[i-1]=='/') { CONSUME(i+4); } else { DESCEND(ref); } break; } /* entities */ /* & > < and " are necessary for storage in XML */ if (!memcmp(inbuf+1,"amp;",4)) { /* might contain another entitized entity (– appears in the dump as &ndash;) */ CONSUME(5); if (inbuf[0] == '#') /* numeric (thus far untested) */ { int consumed=0; int value=0; char d; consumed++; if (inbuf[consumed]=='x') { /* hexadecimal */ for (consumed++; isxdigit(d=inbuf[consumed]); consumed++) { if (isupper(d)) value=value*16+d-'A'+10; else if (islower(d)) value=value*16+d-'a'+10; else if (isdigit(d)) value=value*16+d-'0'; else break; /* this condition should be redundant */ } /* I do not consider &#x; to be a valid entity */ if (consumed > 3 && inbuf[consumed]==';') { #ifdef DEBUG { int i; for (i=0; i<6 && inbuf[i]!=';' && inbuf[i]!='&'; i++) ; if (inbuf[i]==';') { printf("numeric entity: "); for (i=0; inbuf[i]!=';'; i++) { printf("%c",inbuf[i]); } printf(" evaluated as 0x%x\n",value); } } #endif consumed++; PRODUCEUTF8(value); CONSUME(consumed); break; } /* otherwise fall through to normal handling */ } else /* decimal */ { for (; isdigit(d=inbuf[consumed]); consumed++) { if (isdigit(d)) value=value*10+d-'0'; else break; /* this condition should be redundant */ } /* I do not consider &#; to be a valid entity */ if (consumed > 2 && inbuf[consumed]==';') { #ifdef DEBUG { int i; for (i=0; i<6 && inbuf[i]!=';' && inbuf[i]!='&'; i++) ; if (inbuf[i]==';') { printf("numeric entity: "); for (i=0; inbuf[i]!=';'; i++) { printf("%c",inbuf[i]); } printf(" evaluated as %d\n",value); } } #endif consumed++; PRODUCEUTF8(value); CONSUME(consumed); } /* otherwise fall through to normal handling */ } } struct html_entity* result = bsearch(inbuf, entities, ENT_COUNT, sizeof(struct html_entity), &entity_comparer); if(result != NULL) { PRODUCEUTF8(result->utf8); CONSUME(result->string_length); break; } { int i; for (i=0; i<6 && inbuf[i]!=';' && (isalpha(inbuf[i]) || isdigit(inbuf[i])); i++) ; if (inbuf[i]==';') { printf("unsupported entity: "); for (i=0; inbuf[i]!=';'; i++) { printf("%c",inbuf[i]); } printf("\n"); } } PRODUCE('&'); break; } if (!memcmp(inbuf+1,"lt;",3)) { PRODUCE('<'); CONSUME(4); break; } if (!memcmp(inbuf+1,"gt;",3)) { PRODUCE('>'); CONSUME(4); break; } if (!memcmp(inbuf+1,"quot;",5)) { PRODUCE('"'); CONSUME(6); break; } { int i; printf ("unknown entity at top level: "); for (i=0; inbuf[i]!=';'; i++) { printf("%c",inbuf[i]); } printf("\n"); } PASS; break; default: PASS; } PARSETAIL static int parse_article(char * article_parsed, char * article, int article_length_temp) { char * article_parsed_end = article_parsed; const char * article_end = article; int templen = article_length_temp; inbold=0; initalic=0; doparse_article(&article_parsed_end,&article_end,&templen); return article_parsed_end-article_parsed; } /* */ static void do_article(int id, char title[255], char *article, int article_length) { /* Redirect processing, redirects are short in general... speedup :) */ if(article_length<1000) { matchcount_temp=pcre_exec(regex_redirect, NULL, article, article_length, 0, PCRE_NOTEMPTY, ovector_temp, 30); if(matchcount_temp>1) { char redirect_to[255]; pcre_copy_substring(article, ovector_temp, matchcount_temp, 1, redirect_to, 255); if(strlen(redirect_to)<1) printf("Could not process redirect %s\n",title); else { insert_redirect(title, redirect_to); return; } } } static char article_parsed[S_ARTBUF*2]; int article_parsed_length=0; /* This function processes the stylistics in the article. We give the pointer as parameter so it changes our article in same memory */ article_parsed_length=parse_article(article_parsed, article, article_length); /* At this point we should have the correct article in article-string. This needs to be compressed using gzip and then written to the outfile: */ buf_article_header.id=id; buf_article_header.article_length=article_parsed_length; buf_article_header.title_length=strlen(title); if((int)ftell(outfile)+buf_article_header.article_length >= 0x40000000) { /* Begin new output file at about 1gb */ fclose(outfile); open_outfile(1); } buf_title_entry.filenumber=outfilenum; buf_title_entry.fileposition=(int)ftello(outfile); buf_title_entry.title_length=buf_article_header.title_length; fwrite (&buf_title_entry, sizeof(buf_title_entry), 1, titlefile); fwrite (title, sizeof(char), buf_title_entry.title_length, titlefile); gzFile *gzipfile; if((gzipfile=gzdopen(dup(fileno(outfile)),"w9"))==NULL) { perror("gzdopen"); return; } gzwrite(gzipfile, &buf_article_header, sizeof(buf_article_header)); gzwrite(gzipfile, title, buf_title_entry.title_length); gzwrite(gzipfile, article_parsed, article_parsed_length); gzflush(gzipfile, Z_FINISH); gzclose(gzipfile); fseeko(outfile,0,SEEK_END); } static void display_progress(long value, long max) { long i, normalized_val = ((long double)value/(long double)max)*80; max -= 5; /* sizeof("[" "]xx%") */ printf("["); for(i=0; i<80; i++) printf("%c", i < normalized_val ? '*' : ' '); printf("]%02d%%\r", (int) (((long double)value/(long double)max)*100)); fflush(stdout); } int main(int argc,char * argv[]) { printf("Wikipedia XML Converter 0.12\n"); if(argc < 3) { printf("Help:\nThis program converts wikipedia-xml-dump to rockbox ww format.\n"); printf("Usage: xmlconv \n"); printf("example: xmlconv dewiki.xml meindewiki\n"); return 0; } FILE *xmldump; strcpy(outfilename,argv[2]); if (argv[1][0]=='-' && argv[1][1]=='\0') { printf("working on stdin with output prefix %s\n",argv[2]); xmldump = stdin; } else { printf("working on %s with output prefix %s\n",argv[1], outfilename); if((xmldump=fopen(argv[1],"rb"))==NULL) { printf("Error on opening input file\n"); return 1; } } if((titlefile=fopen(strcat(outfilename,".wwt"),"wb"))==NULL) { printf("Error on opening title output file\n"); return 1; } strcpy(outfilename,argv[2]); if((redirectfile=fopen(strcat(outfilename,".wwr"),"wb"))==NULL) { printf("Error on opening redirect output file\n"); return 1; } strcpy(outfilename,argv[2]); outfilenum=-1; if(!open_outfile(1)) return 1; long long int dumplen=0, actlen=0; if (xmldump != stdin) { fseeko(xmldump,0,SEEK_END); dumplen=ftello(xmldump); fseeko(xmldump,0,SEEK_SET); } else dumplen=-1; char linebuffer[S_ROWBUF]; int i=0; const char *error=NULL; int erroffset; int ovector_open[30], ovector_end[30]; int matches_open, matches_closed; pcre *regex_tagname, *regex_endtag, *regex_endtag_text, *regex_betweentags; regex_tagname=pcre_compile("<([a-zA-Z0-9]+)[^>/]*(/)?>", 0, &error, &erroffset, NULL); regex_endtag=pcre_compile("]*>", 0, &error, &erroffset, NULL); regex_endtag_text=pcre_compile("([^<]*)]*>", 0, &error, &erroffset, NULL); regex_betweentags=pcre_compile("<([a-zA-Z0-9]+)[^>]*>([^<]*)(]*>)?", 0, &error, &erroffset, NULL); regex_redirect=pcre_compile("(?i)#REDIRECT:? ?\\[\\[([^\\]]*)]]", 0, &error, &erroffset, NULL); char tagbuf_o[30]; char tagbuf_c[30]; char title[255]; int id=0; char article[S_ARTBUF+1]; char buffer[255]; int article_length=0; int mode=TAG_OUTSIDE; i=0; while (fgets(linebuffer,8191,xmldump)!=0) { i++; if(i%5000==0 && dumplen > 0) { actlen=ftello(xmldump); display_progress(actlen, dumplen); } linebuffer[strlen(linebuffer)-1]=0; matches_open=pcre_exec(regex_tagname, NULL, linebuffer, strlen(linebuffer), 0, PCRE_NOTEMPTY, ovector_open, 30); matches_closed=pcre_exec(regex_endtag, NULL, linebuffer, strlen(linebuffer), 0, PCRE_NOTEMPTY, ovector_end, 30); if(matches_open>=2) { pcre_copy_substring(linebuffer, ovector_open, matches_open, 1, tagbuf_o, 30); if(strcasecmp(tagbuf_o, "page")==0) { if(mode!=TAG_OUTSIDE) printf("page begin but not outside before\n"); mode=TAG_PAGE; } if(strcasecmp(tagbuf_o, "title")==0 && mode==TAG_PAGE) { if((matches_open=pcre_exec(regex_betweentags, NULL, linebuffer, strlen(linebuffer), 0, PCRE_NOTEMPTY, ovector_open, 30))<1) printf("title empty\n"); else pcre_copy_substring(linebuffer, ovector_open, matches_open, 2, title, 255); } if(strcasecmp(tagbuf_o, "revision")==0) { if(mode!=TAG_PAGE) printf("revision but not in page"); mode=TAG_REVISION; } if(strcasecmp(tagbuf_o, "id")==0 && mode==TAG_REVISION) { if((matches_open=pcre_exec(regex_betweentags, NULL, linebuffer, strlen(linebuffer), 0, PCRE_NOTEMPTY, ovector_open, 30))<1) printf("id empty on %s\n", title); else { pcre_copy_substring(linebuffer, ovector_open, matches_open, 2, buffer, 255); id=atoi(buffer); } } if(strcasecmp(tagbuf_o, "contributor")==0) { if(mode!=TAG_REVISION) printf("contributor but not in revision"); mode=TAG_CONTRIBUTOR; } if(strcasecmp(tagbuf_o, "text")==0) { if(mode!=TAG_REVISION) { printf("text but not in revision"); if(mode!=TAG_PAGE) printf("and even not in page"); } article[0]=0; /* clear article buffer */ article_length=0; /* sync length */ if(matches_open!=3) mode=TAG_TEXT; /* When there are 3 matches -> matched */ else continue; /* When 3 matches -> no text so go on */ if((matches_open=pcre_exec(regex_betweentags, NULL, linebuffer, strlen(linebuffer), 0, 0, ovector_open, 30))<1) printf("text didnt match second time\n"); else { pcre_copy_substring(linebuffer, ovector_open, matches_open, 2, article, 8191); article_length=strlen(article); } } } else if(mode==TAG_TEXT && matches_closed<2) { *(article+article_length)='\n'; strcpy(article+article_length+1,linebuffer); article_length+=(strlen(linebuffer)+1); if(article_length>=S_ARTBUF) { printf("Article buffer overflow! Bad sourcefile or increase buffersize\n"); return 0; } } if(matches_closed==2) { pcre_copy_substring(linebuffer, ovector_end, matches_closed, 1, tagbuf_c, 30); if(strcasecmp(tagbuf_c, "contributor")==0) { if(mode!=TAG_CONTRIBUTOR) printf("contributor end but not opened"); mode=TAG_REVISION; } if(strcasecmp(tagbuf_c, "revision")==0) { if(mode!=TAG_REVISION) printf("revision end but not opened"); mode=TAG_PAGE; } if(strcasecmp(tagbuf_c, "text")==0) { if(mode!=TAG_TEXT) printf("text end but not opened"); mode=TAG_REVISION; do_article(id, title, article, article_length); } if(strcasecmp(tagbuf_c, "page")==0) { if(mode!=TAG_PAGE) printf("page end but not opened"); mode=TAG_OUTSIDE; } } } fclose(titlefile); fclose(redirectfile); fclose(outfile); fclose(xmldump); return 0; }