From b1e0e4414ecd3161c0f947daceb8643b5889e51c Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Thu, 18 Jan 2018 00:21:10 +0000 Subject: start to alter parseing to read from cos_stream object --- include/nspdf/document.h | 2 +- src/cos_object.c | 49 ++++++++++++++++++++++++++++++ src/cos_object.h | 51 ++++++++++++++++++++++++++++--- src/cos_parse.c | 79 ++++++++++++++++++++++++++++++++++-------------- src/cos_parse.h | 8 ++++- src/document.c | 13 ++++++-- src/page.c | 38 +++++++++-------------- src/pdf_doc.c | 14 ++++----- src/pdf_doc.h | 24 ++++++++++----- src/xref.c | 8 ++--- 10 files changed, 212 insertions(+), 74 deletions(-) diff --git a/include/nspdf/document.h b/include/nspdf/document.h index d7cbb0f..3c222cf 100644 --- a/include/nspdf/document.h +++ b/include/nspdf/document.h @@ -40,7 +40,7 @@ nspdferror nspdf_document_destroy(struct nspdf_doc *doc); * ready to render pages. The passed buffer ownership is transfered and must * not be altered untill the document is destroyed. */ -nspdferror nspdf_document_parse(struct nspdf_doc *doc, const uint8_t *buffer, uint64_t buffer_length); +nspdferror nspdf_document_parse(struct nspdf_doc *doc, const uint8_t *buffer, unsigned int buffer_length); #endif /* NSPDF_DOCUMENT_H_ */ diff --git a/src/cos_object.c b/src/cos_object.c index 3dc5efa..4398822 100644 --- a/src/cos_object.c +++ b/src/cos_object.c @@ -17,6 +17,7 @@ #include #include "cos_object.h" +#include "cos_parse.h" #include "pdf_doc.h" @@ -368,6 +369,54 @@ cos_get_stream(struct nspdf_doc *doc, } +/* + * get object from object reference + */ +nspdferror +cos_get_object(struct nspdf_doc *doc, + struct cos_object *cobj, + struct cos_object **value_out) +{ + nspdferror res; + res = nspdf__xref_get_referenced(doc, &cobj); + if (res == NSPDFERROR_OK) { + *value_out = cobj; + } + return res; +} + + +nspdferror +cos_get_content(struct nspdf_doc *doc, + struct cos_object *cobj, + struct cos_content **content_out) +{ + nspdferror res; + struct cos_object *content_obj; + + res = nspdf__xref_get_referenced(doc, &cobj); + if (res == NSPDFERROR_OK) { + if (cobj->type == COS_TYPE_STREAM) { + res = cos_parse_content_stream(doc, cobj->u.stream, &content_obj); + if (res == NSPDFERROR_OK) { + /* replace stream object with parsed content operations */ + struct cos_object tmpobj; + tmpobj = *cobj; + *cobj = *content_obj; + *content_obj = tmpobj; + cos_free_object(content_obj); + + *content_out = cobj->u.content; + } + } else if (cobj->type == COS_TYPE_CONTENT) { + *content_out = cobj->u.content; + } else { + res = NSPDFERROR_TYPE; + } + } + return res; +} + /* * get a value for a key from a dictionary */ diff --git a/src/cos_object.h b/src/cos_object.h index 2e763e2..9b98694 100644 --- a/src/cos_object.h +++ b/src/cos_object.h @@ -16,6 +16,7 @@ #define NSPDF__COS_OBJECT_H_ struct nspdf_doc; +struct content_operation; enum cos_type { COS_TYPE_NULL, /* 0 */ @@ -30,6 +31,7 @@ enum cos_type { COS_TYPE_NUMBERTREE, COS_TYPE_STREAM, COS_TYPE_REFERENCE, /* 11 */ + COS_TYPE_CONTENT, /* 12 - parsed content stream */ }; struct cos_object; @@ -59,10 +61,13 @@ struct cos_array { struct cos_object **values; }; +/** + * COS string data + */ struct cos_string { - uint8_t *data; /**< string data */ - size_t length; /**< string length */ + unsigned int length; /**< string length */ size_t alloc; /**< memory allocation for string */ + uint8_t *data; /**< string data */ }; struct cos_reference { @@ -71,12 +76,22 @@ struct cos_reference { }; struct cos_stream { - const uint8_t *data; /**< decoded stream data */ - int64_t length; /**< decoded stream length */ + unsigned int length; /**< decoded stream length */ size_t alloc; /**< memory allocated for stream */ + const uint8_t *data; /**< decoded stream data */ }; +/** + * Synthetic parsed content object. + * + */ +struct cos_content { + unsigned int length; /**< number of content operations */ + unsigned int alloc; /**< number of allocated operations */ + struct content_operation *operations; +}; + struct cos_object { int type; union { @@ -107,6 +122,8 @@ struct cos_object { /** reference */ struct cos_reference *reference; + /** parsed content stream */ + struct cos_content *content; } u; }; @@ -264,5 +281,31 @@ nspdferror cos_get_array(struct nspdf_doc *doc, struct cos_object *cobj, struct */ nspdferror cos_get_stream(struct nspdf_doc *doc, struct cos_object *cobj, struct cos_stream **stream_out); +/** + * get a direct cos object. + * + * Obtain a direct object if the passed object was a reference it is + * dereferenced from the cross reference table. + * + * \param doc The document the cos object belongs to. + * \param cobj A cos object. + * \param object_out The result object. + * \return NSERROR_OK and \p object_out updated, + */ +nspdferror cos_get_object(struct nspdf_doc *doc, struct cos_object *cobj, struct cos_object **object_out); + +/** + * get a parsed content object + * + * Get the parsed content from a cos object, if the object is an object + * reference it will be dereferenced first. + * The parsed content object is *not* a normal COS object rather it is the + * internal result of parsing a PDF content stream. + * This object type is used to replace the stream object in the cross reference + * table after its initial parse to avoid the need to keep and repeatedly + * parse the filtered stream data. + * + */ +nspdferror cos_get_content(struct nspdf_doc *doc, struct cos_object *cobj, struct cos_content **content_out); #endif diff --git a/src/cos_parse.c b/src/cos_parse.c index 21ba0d7..c196019 100644 --- a/src/cos_parse.c +++ b/src/cos_parse.c @@ -57,11 +57,15 @@ static uint8_t xtoi(uint8_t x) return x; } +/** + * parse a number + */ static nspdferror -cos_decode_number(struct nspdf_doc *doc, - uint64_t *offset_out, - struct cos_object **cosobj_out) +cos_parse_number(struct cos_stream *stream, + uint64_t *offset_out, + struct cos_object **cosobj_out) { + nspdferror res; struct cos_object *cosobj; uint8_t c; /* current byte from source data */ unsigned int len; /* number of decimal places in number */ @@ -71,7 +75,7 @@ cos_decode_number(struct nspdf_doc *doc, offset = *offset_out; for (len = 0; len < sizeof(num); len++) { - c = DOC_BYTE(doc, offset); + c = stream_byte(stream, offset); if ((bclass[c] & BC_DCML) != BC_DCML) { int64_t result = 0; /* parsed result */ uint64_t tens; @@ -85,7 +89,10 @@ cos_decode_number(struct nspdf_doc *doc, result += (num[len - 1] * tens); } - doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(stream, &offset); + if (res != NSPDFERROR_OK) { + return res; + } cosobj = calloc(1, sizeof(struct cos_object)); if (cosobj == NULL) { @@ -230,7 +237,7 @@ cos_decode_string(struct nspdf_doc *doc, cos_string_append(cstring, c); } - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); *cosobj_out = cosobj; *offset_out = offset; @@ -279,7 +286,7 @@ cos_decode_hex_string(struct nspdf_doc *doc, cos_string_append(cstring, value); } offset++; - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); *cosobj_out = cosobj; *offset_out = offset; @@ -323,7 +330,7 @@ cos_decode_dictionary(struct nspdf_doc *doc, return NSPDFERROR_SYNTAX; /* syntax error */ } offset += 2; - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); //printf("found a dictionary\n"); @@ -371,7 +378,7 @@ cos_decode_dictionary(struct nspdf_doc *doc, } offset += 2; /* skip closing >> */ - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); *cosobj_out = cosobj; *offset_out = offset; @@ -403,7 +410,7 @@ cos_parse_list(struct nspdf_doc *doc, offset++; /* advance offset to next token */ - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -449,7 +456,7 @@ cos_parse_list(struct nspdf_doc *doc, } offset++; /* skip closing ] */ - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); *cosobj_out = cosobj; *offset_out = offset; @@ -499,7 +506,7 @@ cos_decode_name(struct nspdf_doc *doc, //printf("name: %s\n", name); - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); cosobj = calloc(1, sizeof(struct cos_object)); if (cosobj == NULL) { @@ -575,7 +582,7 @@ cos_decode_boolean(struct nspdf_doc *doc, return -1; /* syntax error */ } - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); cosobj = calloc(1, sizeof(struct cos_object)); if (cosobj == NULL) { @@ -626,7 +633,7 @@ cos_decode_null(struct nspdf_doc *doc, return -1; /* syntax error */ } - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); cosobj = calloc(1, sizeof(struct cos_object)); if (cosobj == NULL) { @@ -656,6 +663,7 @@ cos_parse_stream(struct nspdf_doc *doc, uint64_t offset; struct cos_object *stream_filter; struct cos_stream *stream; + int64_t stream_length; offset = *offset_out; stream_dict = *cosobj_out; @@ -678,7 +686,7 @@ cos_parse_stream(struct nspdf_doc *doc, //printf("detected stream\n"); /* parsed object was a dictionary and there is a stream marker */ - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -688,10 +696,15 @@ cos_parse_stream(struct nspdf_doc *doc, return NSPDFERROR_NOMEM; } - res = cos_get_dictionary_int(doc, stream_dict, "Length", &stream->length); + res = cos_get_dictionary_int(doc, stream_dict, "Length", &stream_length); if (res != NSPDFERROR_OK) { return res; } + if (stream_length < 0) { + return NSPDFERROR_RANGE; + } + stream->length = stream_length; + //printf("stream length %d\n", stream_length); stream->data = doc->start + offset; stream->alloc = 0; /* stream is pointing at non malloced data */ @@ -699,7 +712,7 @@ cos_parse_stream(struct nspdf_doc *doc, offset += stream->length; /* possible whitespace after stream data */ - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -719,7 +732,7 @@ cos_parse_stream(struct nspdf_doc *doc, offset += 9; //printf("detected endstream\n"); - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -785,7 +798,7 @@ cos_attempt_decode_reference(struct nspdf_doc *doc, offset = *offset_out; - res = cos_decode_number(doc, &offset, &generation); + res = cos_parse_number(doc->stream, &offset, &generation); if (res != NSPDFERROR_OK) { /* no error if next token could not be decoded as a number */ return NSPDFERROR_OK; @@ -813,7 +826,7 @@ cos_attempt_decode_reference(struct nspdf_doc *doc, //printf("found object reference\n"); offset ++; - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); nref = calloc(1, sizeof(struct cos_reference)); if (nref == NULL) { @@ -839,7 +852,7 @@ cos_attempt_decode_reference(struct nspdf_doc *doc, //printf("indirect\n"); offset += 3; - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { cos_free_object(generation); return res; @@ -880,7 +893,7 @@ cos_attempt_decode_reference(struct nspdf_doc *doc, offset += 6; //printf("endobj\n"); - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { cos_free_object(indirect); cos_free_object(generation); @@ -980,7 +993,7 @@ cos_parse_object(struct nspdf_doc *doc, case '7': case '8': case '9': - res = cos_decode_number(doc, &offset, &cosobj); + res = cos_parse_number(doc->stream, &offset, &cosobj); /* if type is positive integer try to check for reference */ if ((res == 0) && (cosobj->type == COS_TYPE_INT) && @@ -1032,3 +1045,23 @@ cos_parse_object(struct nspdf_doc *doc, return res; } + +nspdferror +cos_parse_content_stream(struct nspdf_doc *doc, + struct cos_stream *stream, + struct cos_object **content_out) +{ + struct cos_object *cosobj; + + printf("%.*s", (int)stream->length, stream->data); + + cosobj = calloc(1, sizeof(struct cos_object)); + if (cosobj == NULL) { + return NSPDFERROR_NOMEM; + } + cosobj->type = COS_TYPE_CONTENT; + + *content_out = cosobj; + + return NSPDFERROR_OK; +} diff --git a/src/cos_parse.h b/src/cos_parse.h index 0bca79f..8f48108 100644 --- a/src/cos_parse.h +++ b/src/cos_parse.h @@ -17,12 +17,18 @@ struct nspdf_doc; struct cos_object; +struct cos_stream; /** - * Decode input stream into an object + * Parse input stream into an object * * lex and parse a byte stream to generate a COS object. */ nspdferror cos_parse_object(struct nspdf_doc *doc, uint64_t *offset_out, struct cos_object **cosobj_out); +/** + * Parse content stream into content operations object + */ +nspdferror cos_parse_content_stream(struct nspdf_doc *doc, struct cos_stream *stream, struct cos_object **content_out); + #endif diff --git a/src/document.c b/src/document.c index bbe948d..b7a36d2 100644 --- a/src/document.c +++ b/src/document.c @@ -92,7 +92,7 @@ decode_startxref(struct nspdf_doc *doc, } offset += 9; - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -168,7 +168,7 @@ decode_trailer(struct nspdf_doc *doc, return -1; } offset += 7; - doc_skip_ws(doc, &offset); + nspdf__stream_skip_ws(doc->stream, &offset); res = cos_parse_object(doc, &offset, &trailer); if (res != 0) { @@ -422,13 +422,20 @@ static nspdferror check_header(struct nspdf_doc *doc) nspdferror nspdf_document_parse(struct nspdf_doc *doc, const uint8_t *buffer, - uint64_t buffer_length) + unsigned int buffer_length) { nspdferror res; doc->start = buffer; doc->length = buffer_length; + doc->stream = calloc(1, sizeof(struct cos_stream)); + if (doc->stream == NULL) { + return NSPDFERROR_NOMEM; + } + doc->stream->data = buffer; + doc->stream->length = buffer_length; + res = check_header(doc); if (res != 0) { printf("header check failed\n"); diff --git a/src/page.c b/src/page.c index acc97d7..7b6bee8 100644 --- a/src/page.c +++ b/src/page.c @@ -148,10 +148,18 @@ nspdf_page_count(struct nspdf_doc *doc, unsigned int *pages_out) static nspdferror nspdf__render_content_stream(struct nspdf_doc *doc, - struct cos_stream *content_stream) + struct page_table_entry *page_entry, + struct cos_object *content_entry) { - printf("%.*s", (int)content_stream->length, content_stream->data); - return NSPDFERROR_OK; + nspdferror res; + struct cos_content *content_operations; + + res = cos_get_content(doc, content_entry, &content_operations); + if (res == NSPDFERROR_OK) { + printf("%p", content_operations); + } + + return res; } /* exported interface documented in nspdf/page.h */ @@ -160,7 +168,6 @@ nspdf_page_render(struct nspdf_doc *doc, unsigned int page_number) { struct page_table_entry *page_entry; struct cos_object *content_array; - struct cos_stream *content_stream; nspdferror res; page_entry = doc->page_table + page_number; @@ -184,31 +191,16 @@ nspdf_page_render(struct nspdf_doc *doc, unsigned int page_number) content_stream_index, &content_entry); if (res != NSPDFERROR_OK) { - return res; - } - - res = cos_get_stream(doc, content_entry, &content_stream); - if (res != NSPDFERROR_OK) { - return res; + break; } - res = nspdf__render_content_stream(doc, content_stream); + res = nspdf__render_content_stream(doc, page_entry, content_entry); if (res != NSPDFERROR_OK) { - return res; + break; } } } else if (res == NSPDFERROR_TYPE) { - res = cos_get_stream(doc, page_entry->contents, &content_stream); - if (res != NSPDFERROR_OK) { - return res; - } - - res = nspdf__render_content_stream(doc, content_stream); - if (res != NSPDFERROR_OK) { - return res; - } - } else { - return res; + res = nspdf__render_content_stream(doc, page_entry, page_entry->contents); } return res; diff --git a/src/pdf_doc.c b/src/pdf_doc.c index 997a3d7..955f737 100644 --- a/src/pdf_doc.c +++ b/src/pdf_doc.c @@ -19,29 +19,27 @@ #include "cos_object.h" #include "pdf_doc.h" -/** - * move offset to next non whitespace byte - */ -nspdferror doc_skip_ws(struct nspdf_doc *doc, uint64_t *offset) +nspdferror nspdf__stream_skip_ws(struct cos_stream *stream, uint64_t *offset) { uint8_t c; /* TODO sort out keeping offset in range */ - c = DOC_BYTE(doc, *offset); + c = stream_byte(stream, *offset); while ((bclass[c] & (BC_WSPC | BC_CMNT) ) != 0) { (*offset)++; /* skip comments */ if ((bclass[c] & BC_CMNT) != 0) { - c = DOC_BYTE(doc, *offset); + c = stream_byte(stream, *offset); while ((bclass[c] & BC_EOLM ) == 0) { (*offset)++; - c = DOC_BYTE(doc, *offset); + c = stream_byte(stream, *offset); } } - c = DOC_BYTE(doc, *offset); + c = stream_byte(stream, *offset); } return NSPDFERROR_OK; } + /** * move offset to next non eol byte */ diff --git a/src/pdf_doc.h b/src/pdf_doc.h index e362ea6..27a730a 100644 --- a/src/pdf_doc.h +++ b/src/pdf_doc.h @@ -18,11 +18,18 @@ struct xref_table_entry; struct page_table_entry; -/** pdf document */ +/** + * pdf document + */ struct nspdf_doc { const uint8_t *start; /* start of pdf document in input stream */ - uint64_t length; + unsigned int length; + + /** + * input data stream + */ + struct cos_stream *stream; int major; int minor; @@ -46,8 +53,14 @@ struct nspdf_doc { /* byte data acessory, allows for more complex buffer handling in future */ #define DOC_BYTE(doc, offset) (doc->start[(offset)]) +static inline uint8_t +stream_byte(struct cos_stream *stream, unsigned int offset) +{ + return *(stream->data + offset); +} + /* helpers in pdf_doc.c */ -nspdferror doc_skip_ws(struct nspdf_doc *doc, uint64_t *offset); +nspdferror nspdf__stream_skip_ws(struct cos_stream *stream, uint64_t *offset); nspdferror doc_skip_eol(struct nspdf_doc *doc, uint64_t *offset); nspdferror doc_read_uint(struct nspdf_doc *doc, uint64_t *offset_out, uint64_t *result_out); @@ -68,9 +81,6 @@ nspdferror nspdf__xref_allocate(struct nspdf_doc *doc, int64_t size); nspdferror nspdf__decode_page_tree(struct nspdf_doc *doc, struct cos_object *page_tree_node, unsigned int *page_index); /* cos stream filters */ -nspdferror -nspdf__cos_stream_filter(struct nspdf_doc *doc, - const char *filter_name, - struct cos_stream **stream_out); +nspdferror nspdf__cos_stream_filter(struct nspdf_doc *doc, const char *filter_name, struct cos_stream **stream_out); #endif diff --git a/src/xref.c b/src/xref.c index 298c750..2fb9301 100644 --- a/src/xref.c +++ b/src/xref.c @@ -68,7 +68,7 @@ nspdferror nspdf__xref_parse(struct nspdf_doc *doc, uint64_t *offset_out) } offset += 4; - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -79,7 +79,7 @@ nspdferror nspdf__xref_parse(struct nspdf_doc *doc, uint64_t *offset_out) res = doc_read_uint(doc, &offset, &objnumber); while (res == NSPDFERROR_OK) { uint64_t lastobj; - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -89,7 +89,7 @@ nspdferror nspdf__xref_parse(struct nspdf_doc *doc, uint64_t *offset_out) return res; } - res = doc_skip_ws(doc, &offset); + res = nspdf__stream_skip_ws(doc->stream, &offset); if (res != NSPDFERROR_OK) { return res; } @@ -169,7 +169,7 @@ nspdf__xref_get_referenced(struct nspdf_doc *doc, struct cos_object **cobj_out) } if (entry->object == NULL) { - /* indirect object has never been decoded */ + /* indirect object has never been parsed */ offset = entry->offset; res = cos_parse_object(doc, &offset, &indirect); if (res != NSPDFERROR_OK) { -- cgit v1.2.3