diff --git a/xref.go b/xref.go index f7572ea..c9ce9da 100644 --- a/xref.go +++ b/xref.go @@ -444,7 +444,10 @@ func (r *Reader) readCompressedObject(objStmNum, idx int, expect Reference) (Obj num int offset int } - pairs := make([]pair, 0, n) + // /N is attacker-controlled and only loosely bounded by len(content); a huge + // value must not preallocate — append grows pairs to the real entry count. + const maxObjStmPrealloc = 1 << 12 + pairs := make([]pair, 0, min(int(n), maxObjStmPrealloc)) for i := int64(0); i < n; i++ { t1, err := lx.Next() if err != nil || t1.Kind != lex.Integer { diff --git a/xref_test.go b/xref_test.go index d9cb2ef..988bf0b 100644 --- a/xref_test.go +++ b/xref_test.go @@ -4,6 +4,7 @@ import ( "bytes" "compress/zlib" "fmt" + "runtime" "testing" ) @@ -214,6 +215,188 @@ func TestObjStmRejectsHostileHeader(t *testing.T) { } } +// buildObjStmHugeN builds a PDF reachable via an xref stream where object 4 +// is a type-2 entry inside ObjStm 3. The ObjStm decodes to contentLen zero +// bytes (cheap: FlateDecode compresses them to a few KB) but declares +// /N == contentLen. A reader that trusts /N as a slice capacity balloons the +// few-KB file into contentLen*sizeof(pair) bytes before parsing a single entry. +func buildObjStmHugeN(t *testing.T, contentLen int) []byte { + t.Helper() + var zbuf bytes.Buffer + zw := zlib.NewWriter(&zbuf) + if _, err := zw.Write(make([]byte, contentLen)); err != nil { + t.Fatal(err) + } + zw.Close() + compressed := zbuf.Bytes() + + var buf bytes.Buffer + buf.WriteString("%PDF-1.5\n%\xe2\xe3\xcf\xd3\n") + + off3 := buf.Len() + fmt.Fprintf(&buf, "3 0 obj\n<< /Type /ObjStm /N %d /First 4 /Filter /FlateDecode /Length %d >>\nstream\n", + contentLen, len(compressed)) + buf.Write(compressed) + buf.WriteString("\nendstream\nendobj\n") + + off2 := buf.Len() + rows := []byte{ + 0x01, byte(off3 >> 8), byte(off3), 0x00, // obj 3: type 1 @ off3 + 0x02, 0x00, 0x03, 0x00, // obj 4: type 2 -> ObjStm 3, index 0 + } + fmt.Fprintf(&buf, "2 0 obj\n<< /Type /XRef /W [ 1 2 1 ] /Index [ 3 2 ] /Size 5 /Root 1 0 R /Length %d >>\nstream\n", + len(rows)) + buf.Write(rows) + buf.WriteString("\nendstream\nendobj\n") + + fmt.Fprintf(&buf, "startxref\n%d\n%%%%EOF\n", off2) + return buf.Bytes() +} + +func TestObjStmHugeNDoesNotAmplifyAllocation(t *testing.T) { + const contentLen = 16 << 20 // == DefaultMaxStreamSize, the largest /N can be + data := buildObjStmHugeN(t, contentLen) + r, err := Open(bytes.NewReader(data)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + + var before, after runtime.MemStats + runtime.GC() + runtime.ReadMemStats(&before) + _, err = r.Resolve(Reference{Number: 4, Generation: 0}) + runtime.ReadMemStats(&after) + if err == nil { + t.Fatal("expected an error resolving the hostile ObjStm entry, got nil") + } + // Decoding contentLen zero bytes legitimately costs ~2*contentLen; the + // 256 MiB the /N prealloc would add is well past this ceiling. + const limit = 128 << 20 + if used := after.TotalAlloc - before.TotalAlloc; used > limit { + t.Fatalf("resolving a %d-byte ObjStm allocated %d bytes (> %d limit); /N is amplifying allocation", + contentLen, used, limit) + } +} + +// Capping the prealloc must not truncate a legitimate ObjStm whose entry count +// exceeds the cap: object 100+i carries the value 100+i, and resolving one past +// the cap must still return its exact value (proving append grew the slice). +func TestObjStmManyObjectsResolvePastPrealloc(t *testing.T) { + const m = 5000 // > the internal maxObjStmPrealloc (4096) + + var body bytes.Buffer + offsets := make([]int, m) + for i := 0; i < m; i++ { + offsets[i] = body.Len() + fmt.Fprintf(&body, "%d ", 100+i) + } + var head bytes.Buffer + for i := 0; i < m; i++ { + fmt.Fprintf(&head, "%d %d ", 100+i, offsets[i]) + } + content := head.String() + body.String() + + var buf bytes.Buffer + buf.WriteString("%PDF-1.5\n%\xe2\xe3\xcf\xd3\n") + off1 := buf.Len() + fmt.Fprintf(&buf, "1 0 obj\n<< /Type /ObjStm /N %d /First %d /Length %d >>\nstream\n", + m, head.Len(), len(content)) + buf.WriteString(content) + buf.WriteString("\nendstream\nendobj\n") + + last := 100 + m - 1 + off2 := buf.Len() + rows := []byte{ + 0x01, byte(off1 >> 8), byte(off1), 0x00, 0x00, // obj 1: type 1 @ off1 + 0x02, 0x00, 0x01, byte((m - 1) >> 8), byte((m - 1) & 0xff), // last obj: type 2 -> ObjStm 1, idx m-1 + } + fmt.Fprintf(&buf, "2 0 obj\n<< /Type /XRef /W [ 1 2 2 ] /Index [ 1 1 %d 1 ] /Size %d /Root 1 0 R /Length %d >>\nstream\n", + last, last+1, len(rows)) + buf.Write(rows) + buf.WriteString("\nendstream\nendobj\n") + fmt.Fprintf(&buf, "startxref\n%d\n%%%%EOF\n", off2) + + r, err := Open(bytes.NewReader(buf.Bytes())) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + v, err := r.Resolve(Reference{Number: last, Generation: 0}) + if err != nil { + t.Fatalf("Resolve object %d: %v", last, err) + } + n, ok := v.(Integer) + if !ok || int(n) != last { + t.Fatalf("object %d resolved to %v (%T), want Integer %d", last, v, v, last) + } +} + +// With no startxref and no "trailer" keyword, recovery must scan the rebuilt +// objects for a /Type /Catalog and synthesise a trailer pointing at it. +func TestRecoverXrefViaCatalogScan(t *testing.T) { + var buf bytes.Buffer + buf.WriteString("%PDF-1.7\n%\xe2\xe3\xcf\xd3\n") + buf.WriteString("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") + buf.WriteString("2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n") + buf.WriteString("%%EOF\n") + + r, err := Open(bytes.NewReader(buf.Bytes())) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + cat, err := r.Catalog() + if err != nil { + t.Fatalf("Catalog: %v", err) + } + if n, ok := cat.Name("Type"); !ok || n != "Catalog" { + t.Fatalf("/Type %q ok=%v, want Catalog", n, ok) + } +} + +// An incremental update appends a second xref section whose /Prev points back +// at the first. The newer section must win: object 1 resolves to its updated +// body, and trailer keys present only in the older section still resolve. +func TestPrevChainNewestSectionWins(t *testing.T) { + var buf bytes.Buffer + w := func(s string) int { off := buf.Len(); buf.WriteString(s); return off } + w("%PDF-1.7\n%\xe2\xe3\xcf\xd3\n") + off1v1 := w("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") + off2 := w("2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n") + + xref1 := buf.Len() + fmt.Fprintf(&buf, "xref\n0 3\n%010d %05d f \n%010d %05d n \n%010d %05d n \n", + 0, 65535, off1v1, 0, off2, 0) + buf.WriteString("trailer\n<< /Size 3 /Root 1 0 R /Info 2 0 R >>\n") + fmt.Fprintf(&buf, "startxref\n%d\n%%%%EOF\n", xref1) + + off1v2 := buf.Len() + buf.WriteString("1 0 obj\n<< /Type /Catalog /Pages 2 0 R /Lang (en-US) >>\nendobj\n") + xref2 := buf.Len() + fmt.Fprintf(&buf, "xref\n1 1\n%010d %05d n \n", off1v2, 0) + fmt.Fprintf(&buf, "trailer\n<< /Size 3 /Root 1 0 R /Prev %d >>\n", xref1) + fmt.Fprintf(&buf, "startxref\n%d\n%%%%EOF\n", xref2) + + r, err := Open(bytes.NewReader(buf.Bytes())) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer r.Close() + + cat, err := r.Catalog() + if err != nil { + t.Fatalf("Catalog: %v", err) + } + if lang, ok := cat.String("Lang"); !ok || lang != "en-US" { + t.Errorf("/Lang = %q ok=%v, want en-US (updated object 1 not used)", lang, ok) + } + // /Info lives only in the older trailer; the merge must preserve it. + if _, ok := r.Trailer().Get("Info"); !ok { + t.Error("older trailer's /Info lost after /Prev merge") + } +} + // A classical xref subsection declaring far more entries than the file holds // must be handled gracefully (recover), not over-read the buffer. The bound is // also overflow-safe on 32-bit by construction (untestable on a 64-bit run).