diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py index 3ad6af8a..dffc2335 100644 --- a/mindee/image/extracted_image.py +++ b/mindee/image/extracted_image.py @@ -48,7 +48,7 @@ def __init__( self._element_id = 0 if element_id is None else element_id @requires_pillow - def save_to_file(self, output_path: Path | str): + def write_to_file(self, output_path: Path | str): """ Saves the document to a file. diff --git a/mindee/image/extracted_images.py b/mindee/image/extracted_images.py index cf55e57e..511b3301 100644 --- a/mindee/image/extracted_images.py +++ b/mindee/image/extracted_images.py @@ -9,4 +9,4 @@ class ExtractedImages(list[ExtractedImage]): def save_all_to_disk(self, output_path: Path | str) -> None: """Save all extracted images to disk.""" for image in self: - image.save_to_file(output_path) + image.write_to_file(output_path) diff --git a/mindee/pdf/extracted_pdf.py b/mindee/pdf/extracted_pdf.py index 7d1ec8e9..7c63af05 100644 --- a/mindee/pdf/extracted_pdf.py +++ b/mindee/pdf/extracted_pdf.py @@ -3,44 +3,27 @@ from pathlib import Path from typing import BinaryIO -from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE -from mindee.dependencies.decorators import requires_pypdfium2 from mindee.error.mindee_error import MindeeError from mindee.input.bytes_input import BytesInput -if PYPDFIUM2_AVAILABLE: - # pylint: disable=import-error - import pypdfium2 as pdfium -else: - pdfium = None # pylint: disable=invalid-name - class ExtractedPDF: """An extracted sub-Pdf.""" buffer: BinaryIO + """PDF content as a byte stream.""" filename: str - _page_indexes: tuple[int, int] + """Name of the file when writing to disk.""" + _page_indexes: list[int] def __init__( - self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int] + self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: list[int] ): self.buffer = pdf_byte_stream self.filename = filename self._page_indexes = page_indexes - @requires_pypdfium2 - def get_page_count(self) -> int: - """Get the number of pages in the PDF file.""" - try: - pdf = pdfium.PdfDocument(self.buffer) - return len(pdf) - except Exception as e: - raise MindeeError( - "Could not retrieve page count from Extracted PDF object." - ) from e - - def save_to_file(self, output_path: Path | str): + def write_to_file(self, output_path: Path | str): """ Writes the contents of the current PDF object to a file. @@ -66,6 +49,13 @@ def as_input_source(self) -> BytesInput: return BytesInput(self.buffer.read(), self.filename) @property - def page_indexes(self) -> tuple[int, int]: - """This PDF was extracted from this page range of the original PDF.""" + def page_indexes(self) -> list[int]: + """ + 0-based indexes of all pages taken from the original PDF. + """ return self._page_indexes + + @property + def page_count(self) -> int: + """The number of pages in this PDF file.""" + return len(self._page_indexes) diff --git a/mindee/pdf/extracted_pdfs.py b/mindee/pdf/extracted_pdfs.py index 2701b627..cb87aafc 100644 --- a/mindee/pdf/extracted_pdfs.py +++ b/mindee/pdf/extracted_pdfs.py @@ -10,4 +10,4 @@ def save_all_to_disk(self, output_path: Path | str) -> None: """Save all extracted images to disk.""" for image in self: - image.save_to_file(output_path) + image.write_to_file(output_path) diff --git a/mindee/pdf/pdf_extractor.py b/mindee/pdf/pdf_extractor.py index 7ea91897..cf3fecdb 100644 --- a/mindee/pdf/pdf_extractor.py +++ b/mindee/pdf/pdf_extractor.py @@ -9,6 +9,7 @@ from mindee.error.mindee_error import MindeeError from mindee.input.local_input_source import LocalInputSource from mindee.pdf.extracted_pdf import ExtractedPDF +from mindee.pdf.extracted_pdfs import ExtractedPDFs if PYPDFIUM2_AVAILABLE: # pylint: disable=import-error @@ -28,10 +29,12 @@ class PDFExtractor: _source_pdf: BinaryIO _filename: str + _page_count: int @requires_pillow def __init__(self, local_input: LocalInputSource): self._filename = local_input.filename + self._page_count = local_input.page_count if local_input.is_pdf(): self._source_pdf = local_input.file_object else: @@ -40,66 +43,51 @@ def __init__(self, local_input: LocalInputSource): pdf_image.save(self._source_pdf, format="PDF") @requires_pypdfium2 - def get_page_count(self) -> int: - """Get the number of pages in the PDF file.""" - pdf = pdfium.PdfDocument(self._source_pdf) - return len(pdf) - - @requires_pypdfium2 - def cut_pages(self, page_indexes: list) -> BinaryIO: + def extract_single_document(self, page_indexes: list[int]) -> ExtractedPDF: """ Create a new PDF from pages and save it into a buffer. :param page_indexes: List of pages number to use for merging in the original PDF. :return: The buffer containing the new PDF. """ + if not page_indexes or len(page_indexes) == 0: + raise MindeeError("Empty indexes aren't allowed for extraction.") + for page_index in page_indexes: + if page_index > self._page_count: + raise MindeeError(f"Index {page_index} is out of range.") + self._source_pdf.seek(0) new_pdf = pdfium.PdfDocument.new() pdf = pdfium.PdfDocument(self._source_pdf) new_pdf.import_pages(pdf, page_indexes) bytes_io = io.BytesIO() new_pdf.save(bytes_io) - return bytes_io + + first_page = page_indexes[0] + last_page = page_indexes[len(page_indexes) - 1] + return ExtractedPDF( + pdf_byte_stream=bytes_io, + filename=self._make_filename(first_page, last_page), + page_indexes=page_indexes, + ) @requires_pypdfium2 - def extract_sub_documents( + def extract_multiple_documents( self, page_indexes: list[list[int]] - ) -> list[ExtractedPDF]: + ) -> ExtractedPDFs: """ Extract the sub-documents from the main pdf, based on the given list of page indexes. :param page_indexes: 2D list of numbers, representing page indexes. :return: A list of created PDFS. """ + if len(page_indexes) < 1: + raise MindeeError("No indexes provided.") extracted_pdfs: list[ExtractedPDF] = [] - extension = Path(self._filename).suffix - stem = Path(self._filename).stem for page_index_elem in page_indexes: - if not page_index_elem or len(page_index_elem) == 0: - raise MindeeError("Empty indexes aren't allowed for extraction.") - for page_index in page_index_elem: - if page_index > self.get_page_count(): - raise MindeeError(f"Index {page_index} is out of range.") - first_page = page_index_elem[0] - last_page = page_index_elem[len(page_index_elem) - 1] - extracted_pdf = ExtractedPDF( - self.cut_pages(page_index_elem), - f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}", - (first_page, last_page), - ) - extracted_pdfs.append(extracted_pdf) - return extracted_pdfs - - def extract_documents( - self, - page_indexes: list[list[int]], - ) -> list[ExtractedPDF]: - """ - Extracts complete PDFs from the document. + extracted_pdfs.append(self.extract_single_document(page_index_elem)) + return ExtractedPDFs(extracted_pdfs) - :param page_indexes: List of sub-lists of pages to keep. - :return: A list of extracted invoices. - """ - if len(page_indexes) < 1: - raise MindeeError("No indexes provided.") - return self.extract_sub_documents(page_indexes) + def _make_filename(self, first_page: int, last_page: int) -> str: + stem = Path(self._filename).stem + return f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}.pdf" diff --git a/mindee/v1/pdf/pdf_extractor.py b/mindee/v1/pdf/pdf_extractor.py index 782f9fd9..ed5895f5 100644 --- a/mindee/v1/pdf/pdf_extractor.py +++ b/mindee/v1/pdf/pdf_extractor.py @@ -24,11 +24,11 @@ def extract_invoices( if len(page_indexes) < 1: raise MindeeError("No indexes provided.") if not isinstance(page_indexes[0], InvoiceSplitterV1InvoicePageGroup): - return self.extract_sub_documents(page_indexes) # type: ignore + return self.extract_multiple_documents(page_indexes) # type: ignore if not strict: indexes_as_list = [page_index.page_indexes for page_index in page_indexes] # type: ignore - return self.extract_sub_documents(indexes_as_list) + return self.extract_multiple_documents(indexes_as_list) correct_page_indexes: list[list[int]] = [] current_list: list[int] = [] previous_confidence: float | None = None @@ -49,4 +49,4 @@ def extract_invoices( correct_page_indexes.append(current_list) correct_page_indexes.append(page_list) previous_confidence = confidence - return self.extract_sub_documents(correct_page_indexes) + return self.extract_multiple_documents(correct_page_indexes) diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index 8259b65f..e162e0e5 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -15,7 +15,8 @@ def extract_single_split( :param split: List of pages to keep. :return: Extracted PDF """ - return extract_multiple_splits(input_source, [split])[0] + pdf_extractor = PDFExtractor(input_source) + return pdf_extractor.extract_single_document(_range_to_indexes(split)) def extract_multiple_splits( @@ -32,7 +33,11 @@ def extract_multiple_splits( pdf_extractor = PDFExtractor(input_source) page_groups = [] for split in splits: - page_groups.append(list(range(split[0], split[1] + 1))) + page_groups.append(_range_to_indexes(split)) if len(splits) < 1: raise MindeeError("No indexes provided.") - return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups)) + return pdf_extractor.extract_multiple_documents(page_groups) + + +def _range_to_indexes(split: list[int]) -> list[int]: + return list(range(split[0], split[1] + 1)) diff --git a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py index 1a6be6de..1cea24c2 100644 --- a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py @@ -40,7 +40,7 @@ def test_pdf_should_extract_invoices_strict(): ) inference = response.document.inference pdf_extractor = PDFExtractor(invoice_splitter_input) - assert pdf_extractor.get_page_count() == 2 + assert invoice_splitter_input.page_count == 2 extracted_pdfs_not_strict = pdf_extractor.extract_invoices( inference.prediction.invoice_page_groups @@ -48,7 +48,7 @@ def test_pdf_should_extract_invoices_strict(): extracted_pdfs_strict = pdf_extractor.extract_invoices( inference.prediction.invoice_page_groups ) - extracted_base_pdfs = pdf_extractor.extract_documents( + extracted_base_pdfs = pdf_extractor.extract_multiple_documents( [int_list.page_indexes for int_list in inference.prediction.invoice_page_groups] ) for i, extracted_pdf in enumerate(extracted_base_pdfs): diff --git a/tests/v1/extraction/test_pdf_extractor.py b/tests/v1/extraction/test_pdf_extractor.py index 69f303b5..112c56ec 100644 --- a/tests/v1/extraction/test_pdf_extractor.py +++ b/tests/v1/extraction/test_pdf_extractor.py @@ -8,7 +8,7 @@ from mindee.v1.product.invoice_splitter.invoice_splitter_v1_document import ( InvoiceSplitterV1Document, ) -from tests.utils import V1_PRODUCT_DATA_DIR +from tests.utils import OUTPUT_DIR, V1_PRODUCT_DATA_DIR @pytest.fixture @@ -39,7 +39,12 @@ def test_image_should_extract_pdf(invoice_default_sample_path): jpg_input = PathInput(invoice_default_sample_path) assert not jpg_input.is_pdf() extractor = PDFExtractor(jpg_input) - assert extractor.get_page_count() == 1 + extracted_pdf = extractor.extract_single_document([0]) + assert extracted_pdf.page_count == 1 + assert extracted_pdf.page_indexes == [0] + assert extracted_pdf.filename == "default_sample_pages-001-001.pdf" + extracted_pdf.write_to_file(OUTPUT_DIR) + assert (OUTPUT_DIR / extracted_pdf.filename).exists() @pytest.mark.pillow @@ -48,20 +53,20 @@ def test_pdf_should_extract_invoices_no_strict( invoice_splitter_5p_path, loaded_prediction ): pdf_input = PathInput(invoice_splitter_5p_path) + assert pdf_input.page_count == 5 extractor = PDFExtractor(pdf_input) - assert extractor.get_page_count() == 5 extracted_pdfs_no_strict = extractor.extract_invoices( loaded_prediction.invoice_page_groups ) assert len(extracted_pdfs_no_strict) == 3 - assert extracted_pdfs_no_strict[0].get_page_count() == 1 + assert extracted_pdfs_no_strict[0].page_count == 1 assert extracted_pdfs_no_strict[0].filename == "invoice_5p_pages-001-001.pdf" - assert extracted_pdfs_no_strict[1].get_page_count() == 3 + assert extracted_pdfs_no_strict[1].page_count == 3 assert extracted_pdfs_no_strict[1].filename == "invoice_5p_pages-002-004.pdf" - assert extracted_pdfs_no_strict[2].get_page_count() == 1 + assert extracted_pdfs_no_strict[2].page_count == 1 assert extracted_pdfs_no_strict[2].filename == "invoice_5p_pages-005-005.pdf" @@ -71,15 +76,16 @@ def test_pdf_should_extract_invoices_strict( invoice_splitter_5p_path, loaded_prediction ): pdf_input = PathInput(invoice_splitter_5p_path) + assert pdf_input.page_count == 5 + extractor = PDFExtractor(pdf_input) - assert extractor.get_page_count() == 5 extracted_pdfs_strict = extractor.extract_invoices( loaded_prediction.invoice_page_groups, True ) assert len(extracted_pdfs_strict) == 2 - assert extracted_pdfs_strict[0].get_page_count() == 1 + assert extracted_pdfs_strict[0].page_count == 1 assert extracted_pdfs_strict[0].filename == "invoice_5p_pages-001-001.pdf" - assert extracted_pdfs_strict[1].get_page_count() == 4 + assert extracted_pdfs_strict[1].page_count == 4 assert extracted_pdfs_strict[1].filename == "invoice_5p_pages-002-005.pdf" diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index 31a3047f..3d754a4e 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -32,9 +32,9 @@ def test_default_split(): extracted_splits = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_splits) == 2 - assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[0].page_count == 1 assert extracted_splits[0].filename == "default_sample_pages-001-001.pdf" - assert extracted_splits[1].get_page_count() == 1 + assert extracted_splits[1].page_count == 1 assert extracted_splits[1].filename == "default_sample_pages-002-002.pdf" @@ -46,11 +46,14 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): extracted_splits = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_splits) == 3 - assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[0].page_count == 1 + assert extracted_splits[0].page_indexes == [0] assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf" - assert extracted_splits[1].get_page_count() == 3 + assert extracted_splits[1].page_count == 3 + assert extracted_splits[1].page_indexes == [1, 2, 3] assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf" - assert extracted_splits[2].get_page_count() == 1 + assert extracted_splits[2].page_count == 1 + assert extracted_splits[2].page_indexes == [4] assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf" @@ -62,4 +65,4 @@ def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path) split = response.inference.result.splits[1] extracted_split = split.extract_from_input_source(input_sample) - assert extracted_split.get_page_count() == 3 + assert extracted_split.page_count == 3 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index b5719bc7..b7d8bc07 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -58,7 +58,7 @@ def test_pdf_should_extract_splits(): for i in range(len(extracted_splits)): local_input = PathInput(OUTPUT_DIR / output_files[i]) try: - assert local_input.page_count == extracted_splits[i].get_page_count() + assert local_input.page_count == extracted_splits[i].page_count finally: local_input.close() split_input.close()