URI: 
       txml.c - webdump - [FORK] git://git.codemadness.org/webdump
  HTML git clone git://git.z3bra.org/webdump.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       txml.c (10998B)
       ---
            1 #include <sys/types.h>
            2 
            3 #include <ctype.h>
            4 #include <errno.h>
            5 #include <limits.h>
            6 #include <stdio.h>
            7 #include <stdlib.h>
            8 #include <string.h>
            9 
           10 #include "xml.h"
           11 
           12 static void
           13 xml_parseattrs(XMLParser *x)
           14 {
           15         size_t namelen = 0, valuelen;
           16         int c, endsep, endname = 0, valuestart = 0;
           17 
           18         while ((c = GETNEXT()) != EOF) {
           19                 if (isspace(c)) {
           20                         if (namelen)
           21                                 endname = 1;
           22                         continue;
           23                 } else if (c == '?')
           24                         ; /* ignore */
           25                 else if (c == '=') {
           26                         x->name[namelen] = '\0';
           27                         valuestart = 1;
           28                         endname = 1;
           29                 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
           30                         /* attribute without value */
           31                         x->name[namelen] = '\0';
           32                         if (x->xmlattrstart)
           33                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           34                         if (x->xmlattr)
           35                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           36                         if (x->xmlattrend)
           37                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           38                         endname = 0;
           39                         x->name[0] = c;
           40                         namelen = 1;
           41                 } else if (namelen && valuestart) {
           42                         /* attribute with value */
           43                         if (x->xmlattrstart)
           44                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           45 
           46                         valuelen = 0;
           47                         if (c == '\'' || c == '"') {
           48                                 endsep = c;
           49                         } else {
           50                                 endsep = ' '; /* isspace() */
           51                                 goto startvalue;
           52                         }
           53 
           54                         while ((c = GETNEXT()) != EOF) {
           55 startvalue:
           56                                 if (c == '&') { /* entities */
           57                                         x->data[valuelen] = '\0';
           58                                         /* call data function with data before entity if there is data */
           59                                         if (valuelen && x->xmlattr)
           60                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           61                                         x->data[0] = c;
           62                                         valuelen = 1;
           63                                         while ((c = GETNEXT()) != EOF) {
           64                                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
           65                                                         break;
           66                                                 if (valuelen < sizeof(x->data) - 1)
           67                                                         x->data[valuelen++] = c;
           68                                                 else {
           69                                                         /* entity too long for buffer, handle as normal data */
           70                                                         x->data[valuelen] = '\0';
           71                                                         if (x->xmlattr)
           72                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           73                                                         x->data[0] = c;
           74                                                         valuelen = 1;
           75                                                         break;
           76                                                 }
           77                                                 if (c == ';') {
           78                                                         x->data[valuelen] = '\0';
           79                                                         if (x->xmlattrentity)
           80                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           81                                                         valuelen = 0;
           82                                                         break;
           83                                                 }
           84                                         }
           85                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
           86                                         if (valuelen < sizeof(x->data) - 1) {
           87                                                 x->data[valuelen++] = c;
           88                                         } else {
           89                                                 x->data[valuelen] = '\0';
           90                                                 if (x->xmlattr)
           91                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           92                                                 x->data[0] = c;
           93                                                 valuelen = 1;
           94                                         }
           95                                 }
           96                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
           97                                         x->data[valuelen] = '\0';
           98                                         if (x->xmlattr)
           99                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          100                                         if (x->xmlattrend)
          101                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          102                                         break;
          103                                 }
          104                         }
          105                         namelen = endname = valuestart = 0;
          106                 } else if (namelen < sizeof(x->name) - 1) {
          107                         x->name[namelen++] = c;
          108                 }
          109                 if (c == '>') {
          110                         break;
          111                 } else if (c == '/') {
          112                         x->isshorttag = 1;
          113                         x->name[0] = '\0';
          114                         namelen = 0;
          115                 }
          116         }
          117 }
          118 
          119 static void
          120 xml_parsecomment(XMLParser *x)
          121 {
          122         size_t datalen = 0, i = 0;
          123         int c;
          124 
          125         if (x->xmlcommentstart)
          126                 x->xmlcommentstart(x);
          127         while ((c = GETNEXT()) != EOF) {
          128                 if (c == '-' || c == '>') {
          129                         if (x->xmlcomment && datalen) {
          130                                 x->data[datalen] = '\0';
          131                                 x->xmlcomment(x, x->data, datalen);
          132                                 datalen = 0;
          133                         }
          134                 }
          135 
          136                 if (c == '-') {
          137                         if (++i > 2) {
          138                                 if (x->xmlcomment)
          139                                         for (; i > 2; i--)
          140                                                 x->xmlcomment(x, "-", 1);
          141                                 i = 2;
          142                         }
          143                         continue;
          144                 } else if (c == '>' && i == 2) {
          145                         if (x->xmlcommentend)
          146                                 x->xmlcommentend(x);
          147                         return;
          148                 } else if (i) {
          149                         if (x->xmlcomment) {
          150                                 for (; i > 0; i--)
          151                                         x->xmlcomment(x, "-", 1);
          152                         }
          153                         i = 0;
          154                 }
          155 
          156                 if (datalen < sizeof(x->data) - 1) {
          157                         x->data[datalen++] = c;
          158                 } else {
          159                         x->data[datalen] = '\0';
          160                         if (x->xmlcomment)
          161                                 x->xmlcomment(x, x->data, datalen);
          162                         x->data[0] = c;
          163                         datalen = 1;
          164                 }
          165         }
          166 }
          167 
          168 static void
          169 xml_parsecdata(XMLParser *x)
          170 {
          171         size_t datalen = 0, i = 0;
          172         int c;
          173 
          174         if (x->xmlcdatastart)
          175                 x->xmlcdatastart(x);
          176         while ((c = GETNEXT()) != EOF) {
          177                 if (c == ']' || c == '>') {
          178                         if (x->xmlcdata && datalen) {
          179                                 x->data[datalen] = '\0';
          180                                 x->xmlcdata(x, x->data, datalen);
          181                                 datalen = 0;
          182                         }
          183                 }
          184 
          185                 if (c == ']') {
          186                         if (++i > 2) {
          187                                 if (x->xmlcdata)
          188                                         for (; i > 2; i--)
          189                                                 x->xmlcdata(x, "]", 1);
          190                                 i = 2;
          191                         }
          192                         continue;
          193                 } else if (c == '>' && i == 2) {
          194                         if (x->xmlcdataend)
          195                                 x->xmlcdataend(x);
          196                         return;
          197                 } else if (i) {
          198                         if (x->xmlcdata)
          199                                 for (; i > 0; i--)
          200                                         x->xmlcdata(x, "]", 1);
          201                         i = 0;
          202                 }
          203 
          204                 if (datalen < sizeof(x->data) - 1) {
          205                         x->data[datalen++] = c;
          206                 } else {
          207                         x->data[datalen] = '\0';
          208                         if (x->xmlcdata)
          209                                 x->xmlcdata(x, x->data, datalen);
          210                         x->data[0] = c;
          211                         datalen = 1;
          212                 }
          213         }
          214 }
          215 
          216 static int
          217 codepointtoutf8(long r, char *s)
          218 {
          219         if (r == 0) {
          220                 return 0; /* NUL byte */
          221         } else if (r <= 0x7F) {
          222                 /* 1 byte: 0aaaaaaa */
          223                 s[0] = r;
          224                 return 1;
          225         } else if (r <= 0x07FF) {
          226                 /* 2 bytes: 00000aaa aabbbbbb */
          227                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          228                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          229                 return 2;
          230         } else if (r <= 0xFFFF) {
          231                 /* 3 bytes: aaaabbbb bbcccccc */
          232                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          233                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          234                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          235                 return 3;
          236         } else {
          237                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          238                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          239                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          240                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          241                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          242                 return 4;
          243         }
          244 }
          245 
          246 struct namedentity {
          247         const char *entity;
          248         long cp;
          249 };
          250 
          251 int
          252 namedentitycmp(const void *v1, const void *v2)
          253 {
          254         struct namedentity *n1 = (struct namedentity *)v1;
          255         struct namedentity *n2 = (struct namedentity *)v2;
          256 
          257         return strcmp(n1->entity, n2->entity);
          258 }
          259 
          260 static int
          261 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          262 {
          263         static const struct namedentity entities[] = {
          264 #include "namedentities.h"
          265         };
          266         struct namedentity find, *found;
          267         size_t i;
          268 
          269         /* buffer is too small */
          270         if (bufsiz < 5)
          271                 return -1;
          272 
          273         find.entity = e;
          274         found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
          275                 sizeof(*entities), namedentitycmp);
          276         if (found) {
          277                 i = codepointtoutf8(found->cp, buf);
          278                 buf[i] = '\0';
          279                 return i;
          280         }
          281         return -1;
          282 }
          283 
          284 static int
          285 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          286 {
          287         long l;
          288         int len;
          289         char *end;
          290 
          291         /* buffer is too small */
          292         if (bufsiz < 5)
          293                 return -1;
          294 
          295         errno = 0;
          296         /* hex (16) or decimal (10) */
          297         if (*e == 'x')
          298                 l = strtol(++e, &end, 16);
          299         else
          300                 l = strtol(e, &end, 10);
          301         /* invalid value or not a well-formed entity or invalid codepoint */
          302         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
          303                 return -1;
          304         len = codepointtoutf8(l, buf);
          305         buf[len] = '\0';
          306 
          307         return len;
          308 }
          309 
          310 /* convert named- or numeric entity string to buffer string
          311  * returns byte-length of string or -1 on failure. */
          312 int
          313 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          314 {
          315         /* doesn't start with & */
          316         if (e[0] != '&')
          317                 return -1;
          318         /* numeric entity */
          319         if (e[1] == '#')
          320                 return numericentitytostr(e + 2, buf, bufsiz);
          321         else /* named entity */
          322                 return namedentitytostr(e + 1, buf, bufsiz);
          323 }
          324 
          325 void
          326 xml_parse(XMLParser *x)
          327 {
          328         size_t datalen, tagdatalen;
          329         int c, isend;
          330 
          331         while ((c = GETNEXT()) != EOF && c != '<')
          332                 ; /* skip until < */
          333 
          334         while (c != EOF) {
          335                 if (c == '<') { /* parse tag */
          336                         if ((c = GETNEXT()) == EOF)
          337                                 return;
          338 
          339                         if (c == '!') { /* cdata and comments */
          340                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          341                                         /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
          342                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          343                                                 x->data[tagdatalen++] = c;
          344                                         if (c == '>')
          345                                                 break;
          346                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          347                                                         (x->data[0] == '-')) {
          348                                                 xml_parsecomment(x);
          349                                                 break;
          350                                         } else if (c == '[') {
          351                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          352                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          353                                                         xml_parsecdata(x);
          354                                                         break;
          355                                                 }
          356                                         }
          357                                 }
          358                         } else {
          359                                 /* normal tag (open, short open, close), processing instruction. */
          360                                 x->tag[0] = c;
          361                                 x->taglen = 1;
          362                                 x->isshorttag = isend = 0;
          363 
          364                                 /* treat processing instruction as shorttag, don't strip "?" prefix. */
          365                                 if (c == '?') {
          366                                         x->isshorttag = 1;
          367                                 } else if (c == '/') {
          368                                         if ((c = GETNEXT()) == EOF)
          369                                                 return;
          370                                         x->tag[0] = c;
          371                                         isend = 1;
          372                                 }
          373 
          374                                 while ((c = GETNEXT()) != EOF) {
          375                                         if (c == '/')
          376                                                 x->isshorttag = 1; /* short tag */
          377                                         else if (c == '>' || isspace(c)) {
          378                                                 x->tag[x->taglen] = '\0';
          379                                                 if (isend) { /* end tag, starts with </ */
          380                                                         if (x->xmltagend)
          381                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          382                                                         x->tag[0] = '\0';
          383                                                         x->taglen = 0;
          384                                                 } else {
          385                                                         /* start tag */
          386                                                         if (x->xmltagstart)
          387                                                                 x->xmltagstart(x, x->tag, x->taglen);
          388                                                         if (isspace(c))
          389                                                                 xml_parseattrs(x);
          390                                                         if (x->xmltagstartparsed)
          391                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          392                                                 }
          393                                                 /* call tagend for shortform or processing instruction */
          394                                                 if (x->isshorttag) {
          395                                                         if (x->xmltagend)
          396                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          397                                                         x->tag[0] = '\0';
          398                                                         x->taglen = 0;
          399                                                 }
          400                                                 break;
          401                                         } else if (x->taglen < sizeof(x->tag) - 1)
          402                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          403                                 }
          404                         }
          405                 } else {
          406                         /* parse tag data */
          407                         datalen = 0;
          408                         if (x->xmldatastart)
          409                                 x->xmldatastart(x);
          410                         while ((c = GETNEXT()) != EOF) {
          411                                 if (c == '&') {
          412                                         if (datalen) {
          413                                                 x->data[datalen] = '\0';
          414                                                 if (x->xmldata)
          415                                                         x->xmldata(x, x->data, datalen);
          416                                         }
          417                                         x->data[0] = c;
          418                                         datalen = 1;
          419                                         while ((c = GETNEXT()) != EOF) {
          420                                                 if (c == '<')
          421                                                         break;
          422                                                 if (datalen < sizeof(x->data) - 1)
          423                                                         x->data[datalen++] = c;
          424                                                 else {
          425                                                         /* entity too long for buffer, handle as normal data */
          426                                                         x->data[datalen] = '\0';
          427                                                         if (x->xmldata)
          428                                                                 x->xmldata(x, x->data, datalen);
          429                                                         x->data[0] = c;
          430                                                         datalen = 1;
          431                                                         break;
          432                                                 }
          433                                                 if (c == ';') {
          434                                                         x->data[datalen] = '\0';
          435                                                         if (x->xmldataentity)
          436                                                                 x->xmldataentity(x, x->data, datalen);
          437                                                         datalen = 0;
          438                                                         break;
          439                                                 }
          440                                         }
          441                                 } else if (c != '<') {
          442                                         if (datalen < sizeof(x->data) - 1) {
          443                                                 x->data[datalen++] = c;
          444                                         } else {
          445                                                 x->data[datalen] = '\0';
          446                                                 if (x->xmldata)
          447                                                         x->xmldata(x, x->data, datalen);
          448                                                 x->data[0] = c;
          449                                                 datalen = 1;
          450                                         }
          451                                 }
          452                                 if (c == '<') {
          453                                         x->data[datalen] = '\0';
          454                                         if (x->xmldata && datalen)
          455                                                 x->xmldata(x, x->data, datalen);
          456                                         if (x->xmldataend)
          457                                                 x->xmldataend(x);
          458                                         break;
          459                                 }
          460                         }
          461                 }
          462         }
          463 }