From 7220b02603cfd33775e56da19fe9f5fb1da08aa0 Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Thu, 13 Jul 2017 01:03:23 +0100 Subject: got references parsing --- src/xref.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 192 insertions(+), 36 deletions(-) diff --git a/src/xref.c b/src/xref.c index 6af8132..e1f3a9d 100644 --- a/src/xref.c +++ b/src/xref.c @@ -3,6 +3,7 @@ #include #include #include +#include #define SLEN(x) (sizeof((x)) - 1) @@ -14,7 +15,7 @@ #define BC_DELM (1<<4) /* character is a delimiter */ /** - * byte classification + * pdf byte classification */ uint8_t bclass[] = { BC_WSPC, BC_NONE, BC_NONE, BC_NONE, /* 00 - 03 */ @@ -126,7 +127,7 @@ struct cos_dictionary_entry { struct cos_reference { /** id of indirect object */ uint64_t id; - + /* generation of indirect object */ uint64_t generation; }; @@ -136,19 +137,19 @@ struct cos_object { union { /** boolean */ bool b; - + /** integer */ int64_t i; - + /** real */ double r; - + /** name */ char *n; - + /** string */ char *s; - + /** stream data */ uint8_t *stream; @@ -157,7 +158,7 @@ struct cos_object { /** reference */ struct cos_reference *reference; - + } u; }; @@ -169,10 +170,10 @@ struct cos_indirect_object { /* reference identifier */ struct cos_reference ref; - + /** offset of object */ uint64_t offset; - + /* direct object */ struct cos_object *o; }; @@ -367,7 +368,7 @@ int cos_indirect_object_add(struct pdf_doc *doc, nobj->offset = obj_offset; doc->cos_list = nobj; - + printf("xref %"PRIu64" %"PRIu64" %"PRIu64"\n", obj_number, obj_offset, obj_generation); return 0; @@ -390,10 +391,10 @@ int cos_free_object(struct cos_object *cos_obj) dentry = cos_obj->u.dictionary; while (dentry != NULL) { struct cos_dictionary_entry *odentry; - + cos_free_object(dentry->key); cos_free_object(dentry->value); - + odentry = dentry; dentry = dentry->next; free(odentry); @@ -414,7 +415,48 @@ int cos_decode_number(struct pdf_doc *doc, uint64_t *offset_out, struct cos_object **cosobj_out) { - return -1; + struct cos_object *cosobj; + uint8_t c; /* current byte from source data */ + int len; /* number of decimal places in number */ + uint8_t num[21]; /* temporary buffer for decimal values */ + uint64_t offset; /* current offset of source data */ + + offset = *offset_out; + + for (len = 0; len < sizeof(num); len++) { + c = DOC_BYTE(doc, offset); + if ((bclass[c] & BC_DCML) != BC_DCML) { + int64_t result = 0; /* parsed result */ + uint64_t tens; + + if (len == 0) { + return -2; /* parse error no decimals in input */ + } + /* sum value from each place */ + for (tens = 1; len > 0; tens = tens * 10, len--) { + result += (num[len - 1] * tens); + } + + doc_skip_ws(doc, &offset); + + cosobj = calloc(1, sizeof(struct cos_object)); + if (cosobj == NULL) { + return -1; /* memory error */ + } + + cosobj->type = COS_TYPE_INT; + cosobj->u.i = result; + + *cosobj_out = cosobj; + + *offset_out = offset; + + return 0; + } + num[len] = c - '0'; + offset++; + } + return -1; /* number too long */ } int cos_decode_string(struct pdf_doc *doc, @@ -459,19 +501,22 @@ int cos_decode_dictionary(struct pdf_doc *doc, return -1; /* memory error */ } cosobj->type = COS_TYPE_DICTIONARY; - + while ((DOC_BYTE(doc, offset) != '>') && (DOC_BYTE(doc, offset + 1) != '>')) { - + res = cos_decode_object(doc, &offset, &key); if (res != 0) { /* todo free up any dictionary entries already created */ + printf("key object decode failed\n"); return res; } if (key->type != COS_TYPE_NAME) { /* key value pairs without a name */ + printf("key was %d not a name %d\n", key->type, COS_TYPE_NAME); return -1; /* syntax error */ } + printf("key: %s\n", key->u.n); res = cos_decode_object(doc, &offset, &value); if (res != 0) { @@ -491,7 +536,7 @@ int cos_decode_dictionary(struct pdf_doc *doc, entry->next = cosobj->u.dictionary; cosobj->u.dictionary = entry; - + } offset += 2; /* skip closing >> */ doc_skip_ws(doc, &offset); @@ -511,6 +556,11 @@ int cos_decode_list(struct pdf_doc *doc, #define NAME_MAX_LENGTH 127 +/** + * decode a name object + * + * \todo deal with # symbols on pdf versions 1.2 and later + */ int cos_decode_name(struct pdf_doc *doc, uint64_t *offset_out, struct cos_object **cosobj_out) @@ -519,13 +569,31 @@ int cos_decode_name(struct pdf_doc *doc, struct cos_object *cosobj; uint8_t c; char name[NAME_MAX_LENGTH + 1]; + int idx = 0; offset = *offset_out; c = DOC_BYTE(doc, offset++); + if (c != '/') { + return -1; /* names must be prefixed with a / */ + } + printf("found a name\n"); + c = DOC_BYTE(doc, offset++); + while ((idx <= NAME_MAX_LENGTH) && + ((bclass[c] & (BC_WSPC | BC_DELM)) == 0)) { + //printf("%c", c); + name[idx++] = c; + c = DOC_BYTE(doc, offset++); + } + //printf("\nidx: %d\n", idx); + if (idx > NAME_MAX_LENGTH) { + /* name length exceeded implementation limit */ + return -1; + } + name[idx] = 0; - + //printf("name: %s\n", name); doc_skip_ws(doc, &offset); @@ -534,8 +602,8 @@ int cos_decode_name(struct pdf_doc *doc, return -1; /* memory error */ } - cosobj->type = COS_TYPE_BOOL; - cosobj->u.b = value; + cosobj->type = COS_TYPE_NAME; + cosobj->u.n = strdup(name); *cosobj_out = cosobj; @@ -553,7 +621,7 @@ int cos_decode_boolean(struct pdf_doc *doc, struct cos_object *cosobj; uint8_t c; bool value; - + offset = *offset_out; c = DOC_BYTE(doc, offset++); @@ -573,7 +641,7 @@ int cos_decode_boolean(struct pdf_doc *doc, return -1; /* syntax error */ } value = true; - + } else if ((c == 'f') || (c == 'F')) { /* false branch */ @@ -613,7 +681,7 @@ int cos_decode_boolean(struct pdf_doc *doc, *cosobj_out = cosobj; *offset_out = offset; - + return 0; } @@ -625,7 +693,7 @@ int cos_decode_null(struct pdf_doc *doc, uint64_t offset; struct cos_object *cosobj; uint8_t c; - + offset = *offset_out; c = DOC_BYTE(doc, offset++); @@ -644,7 +712,7 @@ int cos_decode_null(struct pdf_doc *doc, if ((c != 'l') && (c != 'L')) { return -1; /* syntax error */ } - + doc_skip_ws(doc, &offset); cosobj = calloc(1, sizeof(struct cos_object)); @@ -658,6 +726,83 @@ int cos_decode_null(struct pdf_doc *doc, return 0; } +/** + * attempt to decode the stream into a reference + * + * The stream has already had a positive integer decoded from it. if another + * positive integer follows and a R character after that it is a reference, + * otherwise bail, but not finding a ref is not an error! + * + * \param doc the pdf document + * \param offset_out offset of current cursor in stream + * \param cosobj_out the object to return into, on input contains the first + * integer + */ +int cos_attempt_decode_reference(struct pdf_doc *doc, + uint64_t *offset_out, + struct cos_object **cosobj_out) +{ + uint64_t offset; + struct cos_object *cosobj; /* possible generation object */ + uint8_t c; + int res; + struct cos_reference *nref; /* new reference */ + + offset = *offset_out; + + res = cos_decode_object(doc, &offset, &cosobj); + if (res != 0) { + return 0; /* no error if object could not be decoded */ + } + + if (cosobj->type != COS_TYPE_INT) { + /* next object was not an integer so not a reference */ + cos_free_object(cosobj); + return 0; + } + + if (cosobj->u.i < 0) { + /* integer was negative so not a reference (generations must be + * non-negative + */ + cos_free_object(cosobj); + return 0; + + } + + /* two int in a row, look for the R */ + c = DOC_BYTE(doc, offset++); + if (c != 'R') { + /* no R so not a reference */ + cos_free_object(cosobj); + return 0; + } + + /* found reference */ + + printf("found reference\n"); + doc_skip_ws(doc, &offset); + + nref = calloc(1, sizeof(struct cos_reference)); + if (nref == NULL) { + /* todo free objects */ + return -1; /* memory error */ + } + + nref->id = (*cosobj_out)->u.i; + nref->generation = cosobj->u.i; + + cos_free_object(*cosobj_out); + + cosobj->type = COS_TYPE_REFERENCE; + cosobj->u.reference = nref; + + *cosobj_out = cosobj; + + *offset_out = offset; + + return 0; +} /** * Decode input stream into an object @@ -678,7 +823,7 @@ int cos_decode_null(struct pdf_doc *doc, * [ a list * t|T boolean true * f|F boolean false - * n|N null + * n|N null * * Grammar is: * cos_object: @@ -711,12 +856,12 @@ int cos_decode_object(struct pdf_doc *doc, uint64_t offset; int res; struct cos_object *cosobj; - + offset = *offset_out; /* object could be any type use first char to try and select */ switch (DOC_BYTE(doc, offset)) { - + case '-': case '+': case '.': @@ -731,7 +876,12 @@ int cos_decode_object(struct pdf_doc *doc, case '8': case '9': res = cos_decode_number(doc, &offset, &cosobj); - /* if type is uint try to check for reference */ + /* if type is positive integer try to check for reference */ + if ((res == 0) && + (cosobj->type == COS_TYPE_INT) && + (cosobj->u.i > 0)) { + res = cos_attempt_decode_reference(doc, &offset, &cosobj); + } break; case '<': @@ -769,7 +919,13 @@ int cos_decode_object(struct pdf_doc *doc, default: res = -1; /* syntax error */ } - + + + if (res == 0) { + *cosobj_out = cosobj; + *offset_out = offset; + } + return res; } @@ -792,7 +948,7 @@ int decode_trailer(struct pdf_doc *doc, uint64_t offset) } offset += 7; doc_skip_ws(doc, &offset); - + res = cos_decode_object(doc, &offset, &trailer); if (res != 0) { return res; @@ -802,7 +958,7 @@ int decode_trailer(struct pdf_doc *doc, uint64_t offset) cos_free_object(trailer); return -1; } - + return 0; } @@ -812,9 +968,9 @@ int decode_xref(struct pdf_doc *doc, uint64_t *offset_out) uint64_t objnum; /* current object number */ uint64_t lastobjnum; uint64_t offset; - + offset = *offset_out; - + /* xref object header */ if ((DOC_BYTE(doc, offset ) != 'x') && (DOC_BYTE(doc, offset + 1) != 'r') && @@ -866,7 +1022,7 @@ int decode_xref(struct pdf_doc *doc, uint64_t *offset_out) objnum++; } // printf("at objnum %"PRIu64"\n", objnum); - + /* first object number in table */ res = doc_read_uint(doc, &offset, &objnum); } @@ -909,6 +1065,6 @@ int main(int argc, char **argv) printf("failed to decode trailer\n"); return res; } - + return 0; } -- cgit v1.2.3