From e35135faa55d2b89a08f8fb9b55ab01d5079ed50 Mon Sep 17 00:00:00 2001 From: Florian Kinder Date: Tue, 23 Jun 2026 11:53:11 +0200 Subject: [PATCH 1/2] Cover error paths to lift library coverage past 90% Characterization tests for untrusted-input error handling across the library packages (87.1% -> 90.1% statements); no production code changed. - reader: typed Resolve* type errors, OpenFile failures, rich DocumentInfo fields, Objects iteration, dangling-reference null, xrefFormat - parse: malformed-token error branches (stray/keyword/unterminated/bad key) - filter: /F+/DP abbreviations, malformed and unsupported filter chains - contentstream: inline-image errors, stray delimiters, readDict errors, trailing-operand drop, All early-break, fake-EI scan - crypt: unsupported /V, /V5 via New, Identity stream override - object: typed-getter misses; dump: looksLikeText, xrefFormat --- contentstream/scanner_test.go | 75 ++++++++++++++++ filter_test.go | 47 ++++++++++ internal/crypt/crypt_test.go | 60 +++++++++++++ object_test.go | 56 ++++++++++++ parse_test.go | 22 +++++ reader_test.go | 158 ++++++++++++++++++++++++++++++++++ text_test.go | 29 +++++++ 7 files changed, 447 insertions(+) diff --git a/contentstream/scanner_test.go b/contentstream/scanner_test.go index da7b59f..a598588 100644 --- a/contentstream/scanner_test.go +++ b/contentstream/scanner_test.go @@ -389,3 +389,78 @@ func TestOperandIntEdgeCases(t *testing.T) { t.Error("overflowing integer literal yielded an int") } } + +// readInlineImage must error, not panic, on a malformed BI block. +func TestInlineImageErrors(t *testing.T) { + for _, src := range []string{ + "BI /W 1", // EOF before ID + "BI 5 ID x EI", // non-name key before ID + "BI /W ] ID x EI", // bad entry value + "BI /W 1 ID abcdefg", // no EI terminator before EOF + } { + t.Run(src, func(t *testing.T) { + if _, err := contentstream.New([]byte(src)).Next(); err == nil { + t.Fatal("expected an error, got nil") + } + }) + } +} + +func TestNextStrayDelimiter(t *testing.T) { + for _, src := range []string{"]", ">>"} { + if _, err := contentstream.New([]byte(src)).Next(); err == nil { + t.Errorf("%q: expected an error, got nil", src) + } + } +} + +// Breaking out of the All range mid-iteration must stop cleanly (the +// yield-returned-false path). +func TestAllEarlyBreak(t *testing.T) { + sc := contentstream.New([]byte("q Q q")) + n := 0 + for op, err := range sc.All() { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + _ = op + n++ + break + } + if n != 1 { + t.Fatalf("iterated %d ops, want 1 before break", n) + } +} + +// Operands left on the stack at EOF with no trailing operator are dropped, not +// emitted as a bogus op. +func TestTrailingOperandsDropped(t *testing.T) { + if ops := collect(t, "1 2 3"); len(ops) != 0 { + t.Fatalf("got %d ops, want 0 (operands without an operator are dropped)", len(ops)) + } +} + +func TestReadDictStructuralErrors(t *testing.T) { + for _, src := range []string{ + "/X << 1 2 >> BDC", // key is not a name + "/X << /K 1", // EOF before '>>' + } { + t.Run(src, func(t *testing.T) { + if _, err := contentstream.New([]byte(src)).Next(); err == nil { + t.Fatal("expected an error, got nil") + } + }) + } +} + +// An "EI" that appears in the image data without a whitespace boundary must be +// skipped; scanning continues to the real, whitespace-delimited terminator. +func TestInlineImageFakeEIInData(t *testing.T) { + ops := collect(t, "BI ID aEIb EI Q") + if len(ops) < 1 || ops[0].Operator != "EI" { + t.Fatalf("want EI op first, got %+v", ops) + } + if string(ops[0].Image) != "aEIb" { + t.Errorf("image = %q, want aEIb", ops[0].Image) + } +} diff --git a/filter_test.go b/filter_test.go index 6ac7cdf..352a264 100644 --- a/filter_test.go +++ b/filter_test.go @@ -130,3 +130,50 @@ func TestParamsFromDict(t *testing.T) { t.Errorf("empty dict = %+v, want zero Params", empty) } } + +// /F and /DP are the /Filter and /DecodeParms abbreviations; a null /DP entry +// leaves that filter with default params. +func TestStreamFilterChainAbbreviations(t *testing.T) { + orig := []byte("abbreviated filter keys decode the same") + var fl bytes.Buffer + zw := zlib.NewWriter(&fl) + zw.Write(orig) + zw.Close() + a85 := make([]byte, ascii85.MaxEncodedLen(fl.Len())) + n := ascii85.Encode(a85, fl.Bytes()) + stream := append(a85[:n:n], '~', '>') + + got := streamObject3(t, buildStreamObjectPDF(t, + "/F [ /ASCII85Decode /FlateDecode ] /DP [ null null ]", stream)) + if !bytes.Equal(got, orig) { + t.Fatalf("abbreviated /F+/DP decode = %q, want %q", got, orig) + } +} + +// A malformed or unsupported filter chain must surface an error from Content(), +// not panic. +func TestStreamFilterChainErrors(t *testing.T) { + cases := map[string]string{ + "filter_wrong_type": "/Filter 42", + "filter_array_bad_entry": "/Filter [ /FlateDecode 42 ]", + "image_only_filter": "/Filter /DCTDecode", + "undecodable_data": "/Filter /FlateDecode", + } + for name, dictEntries := range cases { + t.Run(name, func(t *testing.T) { + data := buildStreamObjectPDF(t, dictEntries, []byte("not valid filtered data")) + r, err := Open(bytes.NewReader(data)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + v, err := r.Resolve(Reference{Number: 3, Generation: 0}) + if err != nil { + t.Fatalf("Resolve: %v", err) + } + if _, err := v.(*Stream).Content(); err == nil { + t.Fatal("expected a Content() error, got nil") + } + }) + } +} diff --git a/internal/crypt/crypt_test.go b/internal/crypt/crypt_test.go index 33a9575..bbcfdcd 100644 --- a/internal/crypt/crypt_test.go +++ b/internal/crypt/crypt_test.go @@ -371,3 +371,63 @@ func TestComputeV5KeyWrongPassword(t *testing.T) { t.Fatal("expected an error for a non-matching password, got nil") } } + +// New must reject an unsupported /V rather than returning a zero handler. +func TestNewUnsupportedVersion(t *testing.T) { + for _, v := range []int{0, 3, 99} { + if _, err := New(Params{V: v}, nil); err == nil { + t.Errorf("New(/V %d) should error", v) + } + } +} + +// New drives the /V 5 branch end to end (AES-256 key derivation + algorithm +// selection), not just computeV5Key in isolation. +func TestNewV5(t *testing.T) { + const R = 6 + password := []byte("v5-user") + fileKey := bytes.Repeat([]byte{0x42}, 32) + uVS := bytes.Repeat([]byte{0x01}, 8) + uKS := bytes.Repeat([]byte{0x02}, 8) + uValHash, err := v5Hash(password, uVS, nil, R) + if err != nil { + t.Fatalf("v5Hash(validation): %v", err) + } + kHash, err := v5Hash(password, uKS, nil, R) + if err != nil { + t.Fatalf("v5Hash(key): %v", err) + } + ue := aesCBCEncryptRaw(t, kHash, make([]byte, aes.BlockSize), fileKey) + userEntry := append(append(append([]byte{}, uValHash...), uVS...), uKS...) + + h, err := New(Params{ + V: 5, R: R, + UserEntry: userEntry, + OwnerEntry: make([]byte, 48), + UE: ue, + OE: make([]byte, 32), + }, password) + if err != nil { + t.Fatalf("New(V5): %v", err) + } + if !bytes.Equal(h.FileKey, fileKey) { + t.Fatal("V5 file key mismatch") + } + if h.StreamAlg != AlgAES256 { + t.Errorf("StreamAlg = %v, want AlgAES256", h.StreamAlg) + } +} + +// A per-stream Identity crypt filter overrides the default algorithm and passes +// the bytes through unchanged. +func TestDecryptStreamIdentityOverride(t *testing.T) { + h := &Handler{StreamAlg: AlgRC4, FileKey: bytes.Repeat([]byte{1}, 16)} + data := []byte("plaintext") + out, err := h.DecryptStream(data, 1, 0, "Identity") + if err != nil { + t.Fatalf("DecryptStream: %v", err) + } + if !bytes.Equal(out, data) { + t.Errorf("Identity override = %q, want %q", out, data) + } +} diff --git a/object_test.go b/object_test.go index 43fbd5b..99eb64e 100644 --- a/object_test.go +++ b/object_test.go @@ -121,3 +121,59 @@ func TestDictNilReceiver(t *testing.T) { t.Error("Iter on nil yielded an entry") } } + +// The typed getters must miss (ok=false) — not coerce or fabricate a zero — on +// a missing key, a wrong type, or an unresolvable Reference (nil reader). +func TestDictTypedGetterMisses(t *testing.T) { + d := newDict(nil) + d.set("i", Integer(5)) + d.set("b", Bool(true)) + d.set("s", String("hi")) + + wrongType := []struct { + name string + ok bool + }{ + {"Bool", func() bool { _, ok := d.Bool("i"); return ok }()}, + {"Int", func() bool { _, ok := d.Int("b"); return ok }()}, + {"Array", func() bool { _, ok := d.Array("i"); return ok }()}, + {"Dict", func() bool { _, ok := d.Dict("i"); return ok }()}, + {"Stream", func() bool { _, ok := d.Stream("i"); return ok }()}, + {"String", func() bool { _, ok := d.String("i"); return ok }()}, + {"Bytes", func() bool { _, ok := d.Bytes("i"); return ok }()}, + } + for _, tc := range wrongType { + if tc.ok { + t.Errorf("%s on a wrong-typed value should miss", tc.name) + } + } + + missing := []bool{ + func() bool { _, ok := d.Int("x"); return ok }(), + func() bool { _, ok := d.Bool("x"); return ok }(), + func() bool { _, ok := d.Array("x"); return ok }(), + func() bool { _, ok := d.Dict("x"); return ok }(), + func() bool { _, ok := d.Stream("x"); return ok }(), + func() bool { _, ok := d.String("x"); return ok }(), + func() bool { _, ok := d.Bytes("x"); return ok }(), + } + for i, ok := range missing { + if ok { + t.Errorf("getter %d on a missing key should miss", i) + } + } + + // A Reference with no backing reader can't be dereferenced. + d.set("ref", Reference{Number: 9}) + if _, ok := d.Int("ref"); ok { + t.Error("Reference with nil reader should miss") + } + + // Controls: the right type resolves. + if v, ok := d.Int("i"); !ok || v != 5 { + t.Errorf("Int(i) = %d, %v; want 5, true", v, ok) + } + if s, ok := d.Bytes("s"); !ok || string(s) != "hi" { + t.Errorf("Bytes(s) = %q, %v; want hi, true", s, ok) + } +} diff --git a/parse_test.go b/parse_test.go index d657b1a..1aae9cd 100644 --- a/parse_test.go +++ b/parse_test.go @@ -5,6 +5,8 @@ import ( "fmt" "strings" "testing" + + "github.com/speedata/pdfdisassembler/internal/lex" ) // buildPDFWithObjectBody puts body as object 3 in a minimal classical-xref PDF. @@ -68,3 +70,23 @@ func TestModeratelyNestedArrayResolves(t *testing.T) { t.Fatalf("got %T, want Array", obj) } } + +// Malformed token streams must error, never panic (cases labelled inline). +func TestParseObjectErrors(t *testing.T) { + for _, src := range []string{ + "", // EOF where an object is expected + "]", // stray ArrayEnd + ">>", // stray DictEnd + "foo", // unexpected keyword + "[ 1 2", // unterminated array + "<< /K 1", // unterminated dict + "<< 1 2 >>", // dict key is not a name + } { + t.Run(src, func(t *testing.T) { + p := newParser(lex.New([]byte(src)), nil) + if _, err := p.parseObject(); err == nil { + t.Fatal("expected an error, got nil") + } + }) + } +} diff --git a/reader_test.go b/reader_test.go index 1a9413d..13985c2 100644 --- a/reader_test.go +++ b/reader_test.go @@ -463,3 +463,161 @@ func TestStreamLengthRejectsBadLength(t *testing.T) { } }) } + +// A bare Reader works because a non-Reference value resolves to itself; each +// helper must error on the wrong type, not pass back a zero value as success. +func TestResolveHelperTypeErrors(t *testing.T) { + r := &Reader{} + errCases := []struct { + name string + call func() error + }{ + {"dict_from_int", func() error { _, e := r.ResolveDict(Integer(1)); return e }}, + {"dict_from_null", func() error { _, e := r.ResolveDict(Null{}); return e }}, + {"dict_from_nil", func() error { _, e := r.ResolveDict(nil); return e }}, + {"bool_from_int", func() error { _, e := r.ResolveBool(Integer(1)); return e }}, + {"int_from_bool", func() error { _, e := r.ResolveInt(Bool(true)); return e }}, + {"array_from_int", func() error { _, e := r.ResolveArray(Integer(1)); return e }}, + {"stream_from_int", func() error { _, e := r.DecodeStream(Integer(1)); return e }}, + } + for _, tc := range errCases { + if tc.call() == nil { + t.Errorf("%s: expected an error, got nil", tc.name) + } + } + // Controls: the right type resolves cleanly. + if b, err := r.ResolveBool(Bool(true)); err != nil || !b { + t.Errorf("ResolveBool(true) = %v, %v", b, err) + } + if n, err := r.ResolveInt(Integer(7)); err != nil || n != 7 { + t.Errorf("ResolveInt(7) = %v, %v", n, err) + } + if a, err := r.ResolveArray(Array{Integer(1)}); err != nil || len(a) != 1 { + t.Errorf("ResolveArray = %v, %v", a, err) + } +} + +// OpenFile must surface the os.Open error for a missing path, and must not leak +// the descriptor when the file opens but doesn't parse as a PDF. +func TestOpenFileErrors(t *testing.T) { + if _, err := OpenFile(filepath.Join(t.TempDir(), "missing.pdf")); err == nil { + t.Error("OpenFile(missing) should error") + } + bad := filepath.Join(t.TempDir(), "bad.pdf") + if err := os.WriteFile(bad, []byte("not a pdf"), 0o644); err != nil { + t.Fatal(err) + } + if _, err := OpenFile(bad); err == nil { + t.Error("OpenFile(garbage) should error") + } +} + +func TestXrefFormat(t *testing.T) { + withTrailer := func(set func(d *Dict)) *Reader { + d := newDict(nil) + set(d) + return &Reader{trailer: d} + } + if got := (&Reader{}).xrefFormat(); got != "unknown" { + t.Errorf("nil trailer = %q, want unknown", got) + } + if got := withTrailer(func(d *Dict) { d.set("Type", Name("XRef")) }).xrefFormat(); got != "stream" { + t.Errorf("/Type /XRef = %q, want stream", got) + } + if got := withTrailer(func(d *Dict) { d.set("XRefStm", Integer(99)) }).xrefFormat(); got != "hybrid" { + t.Errorf("/XRefStm = %q, want hybrid", got) + } + if got := withTrailer(func(d *Dict) { d.set("Size", Integer(4)) }).xrefFormat(); got != "classical" { + t.Errorf("plain trailer = %q, want classical", got) + } +} + +// Non-standard /Info keys land in Custom; a non-string entry is skipped, not +// rendered. +func TestDocumentInfoRichFields(t *testing.T) { + var buf bytes.Buffer + off := func() int { return buf.Len() } + fmt.Fprint(&buf, "%PDF-1.7\n%\xE2\xE3\xCF\xD3\n") + offsets := make([]int, 4) + offsets[1] = off() + fmt.Fprint(&buf, "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") + offsets[2] = off() + fmt.Fprint(&buf, "2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n") + offsets[3] = off() + fmt.Fprint(&buf, "3 0 obj\n<< /Author (Ada) /Subject (Math) /Keywords (a,b) "+ + "/Creator (X) /Producer (Y) /CreationDate (D:20200102030405Z) "+ + "/ModDate (D:20210102030405Z) /Custom (cval) /NotAString 42 >>\nendobj\n") + xrefOff := off() + fmt.Fprint(&buf, "xref\n0 4\n") + fmt.Fprintf(&buf, "%010d %05d f \n", 0, 65535) + for i := 1; i <= 3; i++ { + fmt.Fprintf(&buf, "%010d %05d n \n", offsets[i], 0) + } + fmt.Fprint(&buf, "trailer\n<< /Size 4 /Root 1 0 R /Info 3 0 R >>\n") + fmt.Fprintf(&buf, "startxref\n%d\n%%%%EOF\n", xrefOff) + + r, err := Open(bytes.NewReader(buf.Bytes())) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + info := r.DocumentInfo() + if info.Author != "Ada" || info.Subject != "Math" || info.Keywords != "a,b" || + info.Creator != "X" || info.Producer != "Y" { + t.Errorf("string fields wrong: %+v", info) + } + if info.CreationDate.Year() != 2020 || info.ModDate.Year() != 2021 { + t.Errorf("dates wrong: created %v, mod %v", info.CreationDate, info.ModDate) + } + if info.Custom["Custom"] != "cval" { + t.Errorf("Custom[Custom] = %q, want cval", info.Custom["Custom"]) + } + if _, ok := info.Custom["NotAString"]; ok { + t.Error("non-string /NotAString should be skipped, not collected") + } +} + +func TestObjectsIteration(t *testing.T) { + r, err := Open(bytes.NewReader(buildMinimalPDF(t))) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + var nums []int + for e := range r.Objects() { + nums = append(nums, e.Reference.Number) + } + if len(nums) < 4 { + t.Fatalf("iterated %d objects, want >= 4", len(nums)) + } + for i := 1; i < len(nums); i++ { + if nums[i] < nums[i-1] { + t.Errorf("objects out of order: %d before %d", nums[i-1], nums[i]) + } + } + count := 0 + for range r.Objects() { + count++ + break + } + if count != 1 { + t.Fatalf("early break iterated %d, want 1", count) + } +} + +// A reference to an object absent from the xref table resolves to null per +// §7.3.10, not an error. +func TestResolveDanglingReference(t *testing.T) { + r, err := Open(bytes.NewReader(buildMinimalPDF(t))) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + v, err := r.Resolve(Reference{Number: 99}) + if err != nil { + t.Fatalf("dangling ref: %v", err) + } + if _, ok := v.(Null); !ok { + t.Errorf("dangling ref = %T, want Null", v) + } +} diff --git a/text_test.go b/text_test.go index 49cbdb5..6680ffd 100644 --- a/text_test.go +++ b/text_test.go @@ -95,3 +95,32 @@ func TestParseDate(t *testing.T) { t.Errorf("tz parse = %v, want %v", got, want) } } + +// looksLikeText drives the dump heuristic that decides whether a string is +// rendered inline or hex-escaped: BOMs and ASCII are text; control bytes are +// not; high bytes are text only if they decode cleanly under PDFDocEncoding +// (0x80 = bullet) and not when they hit an undefined slot (0x9F). +func TestLooksLikeText(t *testing.T) { + cases := []struct { + name string + in String + want bool + }{ + {"utf16be_bom", String{0xFE, 0xFF, 0, 'A'}, true}, + {"utf16le_bom", String{0xFF, 0xFE, 'A', 0}, true}, + {"utf8_bom", String{0xEF, 0xBB, 0xBF, 'h', 'i'}, true}, + {"ascii", String("Hello, World!"), true}, + {"ascii_with_whitespace", String("a\tb\r\nc"), true}, + {"control_byte", String{'a', 0x01, 'b'}, false}, + {"del_byte", String{0x7F}, false}, + {"pdfdoc_high_clean", String{0x80}, true}, + {"pdfdoc_high_undefined", String{0x9F}, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := looksLikeText(c.in); got != c.want { + t.Errorf("looksLikeText(% x) = %v, want %v", c.in, got, c.want) + } + }) + } +} From de7f63d83da3c1dc4a52b5476c30e3123dc1fd31 Mon Sep 17 00:00:00 2001 From: Florian Kinder Date: Tue, 23 Jun 2026 12:01:15 +0200 Subject: [PATCH 2/2] Tighten two test comments that still restated the cases looksLikeText: drop the rule list the named cases already show, keep the non-obvious high-byte magic values. parseObjectFrom: drop the meta parenthetical. --- parse_test.go | 2 +- text_test.go | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/parse_test.go b/parse_test.go index 1aae9cd..ef7e3af 100644 --- a/parse_test.go +++ b/parse_test.go @@ -71,7 +71,7 @@ func TestModeratelyNestedArrayResolves(t *testing.T) { } } -// Malformed token streams must error, never panic (cases labelled inline). +// Malformed token streams must error, never panic. func TestParseObjectErrors(t *testing.T) { for _, src := range []string{ "", // EOF where an object is expected diff --git a/text_test.go b/text_test.go index 6680ffd..ae50d5a 100644 --- a/text_test.go +++ b/text_test.go @@ -96,10 +96,9 @@ func TestParseDate(t *testing.T) { } } -// looksLikeText drives the dump heuristic that decides whether a string is -// rendered inline or hex-escaped: BOMs and ASCII are text; control bytes are -// not; high bytes are text only if they decode cleanly under PDFDocEncoding -// (0x80 = bullet) and not when they hit an undefined slot (0x9F). +// The dump heuristic for rendering a string inline vs hex-escaped. The +// high-byte cases are the subtle ones: 0x80 is a clean PDFDocEncoding glyph +// (bullet), 0x9F an undefined slot. func TestLooksLikeText(t *testing.T) { cases := []struct { name string