Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mindee/image/extracted_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
self._element_id = 0 if element_id is None else element_id

@requires_pillow
def save_to_file(self, output_path: Path | str):
def write_to_file(self, output_path: Path | str):
"""
Saves the document to a file.

Expand Down
2 changes: 1 addition & 1 deletion mindee/image/extracted_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ class ExtractedImages(list[ExtractedImage]):
def save_all_to_disk(self, output_path: Path | str) -> None:
"""Save all extracted images to disk."""
for image in self:
image.save_to_file(output_path)
image.write_to_file(output_path)
38 changes: 14 additions & 24 deletions mindee/pdf/extracted_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,27 @@
from pathlib import Path
from typing import BinaryIO

from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
from mindee.dependencies.decorators import requires_pypdfium2
from mindee.error.mindee_error import MindeeError
from mindee.input.bytes_input import BytesInput

if PYPDFIUM2_AVAILABLE:
# pylint: disable=import-error
import pypdfium2 as pdfium
else:
pdfium = None # pylint: disable=invalid-name


class ExtractedPDF:
"""An extracted sub-Pdf."""

buffer: BinaryIO
"""PDF content as a byte stream."""
filename: str
_page_indexes: tuple[int, int]
"""Name of the file when writing to disk."""
_page_indexes: list[int]

def __init__(
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int]
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: list[int]
):
self.buffer = pdf_byte_stream
self.filename = filename
self._page_indexes = page_indexes

@requires_pypdfium2
def get_page_count(self) -> int:
"""Get the number of pages in the PDF file."""
try:
pdf = pdfium.PdfDocument(self.buffer)
return len(pdf)
except Exception as e:
raise MindeeError(
"Could not retrieve page count from Extracted PDF object."
) from e

def save_to_file(self, output_path: Path | str):
def write_to_file(self, output_path: Path | str):
"""
Writes the contents of the current PDF object to a file.

Expand All @@ -66,6 +49,13 @@ def as_input_source(self) -> BytesInput:
return BytesInput(self.buffer.read(), self.filename)

@property
def page_indexes(self) -> tuple[int, int]:
"""This PDF was extracted from this page range of the original PDF."""
def page_indexes(self) -> list[int]:
"""
0-based indexes of all pages taken from the original PDF.
"""
return self._page_indexes

@property
def page_count(self) -> int:
"""The number of pages in this PDF file."""
return len(self._page_indexes)
2 changes: 1 addition & 1 deletion mindee/pdf/extracted_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ def save_all_to_disk(self, output_path: Path | str) -> None:
"""Save all extracted PDFs to disk."""

for image in self:
image.save_to_file(output_path)
image.write_to_file(output_path)
66 changes: 27 additions & 39 deletions mindee/pdf/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from mindee.error.mindee_error import MindeeError
from mindee.input.local_input_source import LocalInputSource
from mindee.pdf.extracted_pdf import ExtractedPDF
from mindee.pdf.extracted_pdfs import ExtractedPDFs

if PYPDFIUM2_AVAILABLE:
# pylint: disable=import-error
Expand All @@ -28,10 +29,12 @@ class PDFExtractor:

_source_pdf: BinaryIO
_filename: str
_page_count: int

@requires_pillow
def __init__(self, local_input: LocalInputSource):
self._filename = local_input.filename
self._page_count = local_input.page_count
if local_input.is_pdf():
self._source_pdf = local_input.file_object
else:
Expand All @@ -40,66 +43,51 @@ def __init__(self, local_input: LocalInputSource):
pdf_image.save(self._source_pdf, format="PDF")

@requires_pypdfium2
def get_page_count(self) -> int:
"""Get the number of pages in the PDF file."""
pdf = pdfium.PdfDocument(self._source_pdf)
return len(pdf)

@requires_pypdfium2
def cut_pages(self, page_indexes: list) -> BinaryIO:
def extract_single_document(self, page_indexes: list[int]) -> ExtractedPDF:
"""
Create a new PDF from pages and save it into a buffer.

:param page_indexes: List of 0-based page indexes in the original PDF to include in the new PDF.
:return: The extracted PDF, whose buffer contains the new PDF.
"""
if not page_indexes or len(page_indexes) == 0:
raise MindeeError("Empty indexes aren't allowed for extraction.")
for page_index in page_indexes:
if page_index > self._page_count:
raise MindeeError(f"Index {page_index} is out of range.")

self._source_pdf.seek(0)
new_pdf = pdfium.PdfDocument.new()
pdf = pdfium.PdfDocument(self._source_pdf)
new_pdf.import_pages(pdf, page_indexes)
bytes_io = io.BytesIO()
new_pdf.save(bytes_io)
return bytes_io

first_page = page_indexes[0]
last_page = page_indexes[len(page_indexes) - 1]
return ExtractedPDF(
pdf_byte_stream=bytes_io,
filename=self._make_filename(first_page, last_page),
page_indexes=page_indexes,
)

@requires_pypdfium2
def extract_sub_documents(
def extract_multiple_documents(
self, page_indexes: list[list[int]]
) -> list[ExtractedPDF]:
) -> ExtractedPDFs:
"""
Extract the sub-documents from the main pdf, based on the given list of page indexes.

:param page_indexes: 2D list of numbers, representing page indexes.
:return: A list of created PDFs.
"""
if len(page_indexes) < 1:
raise MindeeError("No indexes provided.")
extracted_pdfs: list[ExtractedPDF] = []
extension = Path(self._filename).suffix
stem = Path(self._filename).stem
for page_index_elem in page_indexes:
if not page_index_elem or len(page_index_elem) == 0:
raise MindeeError("Empty indexes aren't allowed for extraction.")
for page_index in page_index_elem:
if page_index > self.get_page_count():
raise MindeeError(f"Index {page_index} is out of range.")
first_page = page_index_elem[0]
last_page = page_index_elem[len(page_index_elem) - 1]
extracted_pdf = ExtractedPDF(
self.cut_pages(page_index_elem),
f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}",
(first_page, last_page),
)
extracted_pdfs.append(extracted_pdf)
return extracted_pdfs

def extract_documents(
self,
page_indexes: list[list[int]],
) -> list[ExtractedPDF]:
"""
Extracts complete PDFs from the document.
extracted_pdfs.append(self.extract_single_document(page_index_elem))
return ExtractedPDFs(extracted_pdfs)

:param page_indexes: List of sub-lists of pages to keep.
:return: A list of extracted invoices.
"""
if len(page_indexes) < 1:
raise MindeeError("No indexes provided.")
return self.extract_sub_documents(page_indexes)
def _make_filename(self, first_page: int, last_page: int) -> str:
stem = Path(self._filename).stem
return f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}.pdf"
6 changes: 3 additions & 3 deletions mindee/v1/pdf/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ def extract_invoices(
if len(page_indexes) < 1:
raise MindeeError("No indexes provided.")
if not isinstance(page_indexes[0], InvoiceSplitterV1InvoicePageGroup):
return self.extract_sub_documents(page_indexes) # type: ignore
return self.extract_multiple_documents(page_indexes) # type: ignore

if not strict:
indexes_as_list = [page_index.page_indexes for page_index in page_indexes] # type: ignore
return self.extract_sub_documents(indexes_as_list)
return self.extract_multiple_documents(indexes_as_list)
correct_page_indexes: list[list[int]] = []
current_list: list[int] = []
previous_confidence: float | None = None
Expand All @@ -49,4 +49,4 @@ def extract_invoices(
correct_page_indexes.append(current_list)
correct_page_indexes.append(page_list)
previous_confidence = confidence
return self.extract_sub_documents(correct_page_indexes)
return self.extract_multiple_documents(correct_page_indexes)
11 changes: 8 additions & 3 deletions mindee/v2/file_operations/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def extract_single_split(
:param split: List of pages to keep.
:return: Extracted PDF
"""
return extract_multiple_splits(input_source, [split])[0]
pdf_extractor = PDFExtractor(input_source)
return pdf_extractor.extract_single_document(_range_to_indexes(split))


def extract_multiple_splits(
Expand All @@ -32,7 +33,11 @@ def extract_multiple_splits(
pdf_extractor = PDFExtractor(input_source)
page_groups = []
for split in splits:
page_groups.append(list(range(split[0], split[1] + 1)))
page_groups.append(_range_to_indexes(split))
if len(splits) < 1:
raise MindeeError("No indexes provided.")
return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups))
return pdf_extractor.extract_multiple_documents(page_groups)


def _range_to_indexes(split: list[int]) -> list[int]:
return list(range(split[0], split[1] + 1))
4 changes: 2 additions & 2 deletions tests/v1/extraction/test_invoice_splitter_auto_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ def test_pdf_should_extract_invoices_strict():
)
inference = response.document.inference
pdf_extractor = PDFExtractor(invoice_splitter_input)
assert pdf_extractor.get_page_count() == 2
assert invoice_splitter_input.page_count == 2

extracted_pdfs_not_strict = pdf_extractor.extract_invoices(
inference.prediction.invoice_page_groups
)
extracted_pdfs_strict = pdf_extractor.extract_invoices(
inference.prediction.invoice_page_groups
)
extracted_base_pdfs = pdf_extractor.extract_documents(
extracted_base_pdfs = pdf_extractor.extract_multiple_documents(
[int_list.page_indexes for int_list in inference.prediction.invoice_page_groups]
)
for i, extracted_pdf in enumerate(extracted_base_pdfs):
Expand Down
24 changes: 15 additions & 9 deletions tests/v1/extraction/test_pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from mindee.v1.product.invoice_splitter.invoice_splitter_v1_document import (
InvoiceSplitterV1Document,
)
from tests.utils import V1_PRODUCT_DATA_DIR
from tests.utils import OUTPUT_DIR, V1_PRODUCT_DATA_DIR


@pytest.fixture
Expand Down Expand Up @@ -39,7 +39,12 @@ def test_image_should_extract_pdf(invoice_default_sample_path):
jpg_input = PathInput(invoice_default_sample_path)
assert not jpg_input.is_pdf()
extractor = PDFExtractor(jpg_input)
assert extractor.get_page_count() == 1
extracted_pdf = extractor.extract_single_document([0])
assert extracted_pdf.page_count == 1
assert extracted_pdf.page_indexes == [0]
assert extracted_pdf.filename == "default_sample_pages-001-001.pdf"
extracted_pdf.write_to_file(OUTPUT_DIR)
assert (OUTPUT_DIR / extracted_pdf.filename).exists()


@pytest.mark.pillow
Expand All @@ -48,20 +53,20 @@ def test_pdf_should_extract_invoices_no_strict(
invoice_splitter_5p_path, loaded_prediction
):
pdf_input = PathInput(invoice_splitter_5p_path)
assert pdf_input.page_count == 5
extractor = PDFExtractor(pdf_input)
assert extractor.get_page_count() == 5
extracted_pdfs_no_strict = extractor.extract_invoices(
loaded_prediction.invoice_page_groups
)

assert len(extracted_pdfs_no_strict) == 3
assert extracted_pdfs_no_strict[0].get_page_count() == 1
assert extracted_pdfs_no_strict[0].page_count == 1
assert extracted_pdfs_no_strict[0].filename == "invoice_5p_pages-001-001.pdf"

assert extracted_pdfs_no_strict[1].get_page_count() == 3
assert extracted_pdfs_no_strict[1].page_count == 3
assert extracted_pdfs_no_strict[1].filename == "invoice_5p_pages-002-004.pdf"

assert extracted_pdfs_no_strict[2].get_page_count() == 1
assert extracted_pdfs_no_strict[2].page_count == 1
assert extracted_pdfs_no_strict[2].filename == "invoice_5p_pages-005-005.pdf"


Expand All @@ -71,15 +76,16 @@ def test_pdf_should_extract_invoices_strict(
invoice_splitter_5p_path, loaded_prediction
):
pdf_input = PathInput(invoice_splitter_5p_path)
assert pdf_input.page_count == 5

extractor = PDFExtractor(pdf_input)
assert extractor.get_page_count() == 5
extracted_pdfs_strict = extractor.extract_invoices(
loaded_prediction.invoice_page_groups, True
)

assert len(extracted_pdfs_strict) == 2
assert extracted_pdfs_strict[0].get_page_count() == 1
assert extracted_pdfs_strict[0].page_count == 1
assert extracted_pdfs_strict[0].filename == "invoice_5p_pages-001-001.pdf"

assert extracted_pdfs_strict[1].get_page_count() == 4
assert extracted_pdfs_strict[1].page_count == 4
assert extracted_pdfs_strict[1].filename == "invoice_5p_pages-002-005.pdf"
15 changes: 9 additions & 6 deletions tests/v2/file_operations/test_split_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def test_default_split():
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
assert len(extracted_splits) == 2

assert extracted_splits[0].get_page_count() == 1
assert extracted_splits[0].page_count == 1
assert extracted_splits[0].filename == "default_sample_pages-001-001.pdf"
assert extracted_splits[1].get_page_count() == 1
assert extracted_splits[1].page_count == 1
assert extracted_splits[1].filename == "default_sample_pages-002-002.pdf"


Expand All @@ -46,11 +46,14 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
assert len(extracted_splits) == 3

assert extracted_splits[0].get_page_count() == 1
assert extracted_splits[0].page_count == 1
assert extracted_splits[0].page_indexes == [0]
assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf"
assert extracted_splits[1].get_page_count() == 3
assert extracted_splits[1].page_count == 3
assert extracted_splits[1].page_indexes == [1, 2, 3]
assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf"
assert extracted_splits[2].get_page_count() == 1
assert extracted_splits[2].page_count == 1
assert extracted_splits[2].page_indexes == [4]
assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf"


Expand All @@ -62,4 +65,4 @@ def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path)
split = response.inference.result.splits[1]
extracted_split = split.extract_from_input_source(input_sample)

assert extracted_split.get_page_count() == 3
assert extracted_split.page_count == 3
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_pdf_should_extract_splits():
for i in range(len(extracted_splits)):
local_input = PathInput(OUTPUT_DIR / output_files[i])
try:
assert local_input.page_count == extracted_splits[i].get_page_count()
assert local_input.page_count == extracted_splits[i].page_count
finally:
local_input.close()
split_input.close()
Expand Down