From fc0d9d1d60530392ec9d767e63082de552a7532d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 26 Jun 2026 23:23:08 +0200 Subject: [PATCH 1/2] remove crop_files.rb --- lib/mindee/image.rb | 1 + lib/mindee/image/extracted_image.rb | 6 ++--- lib/mindee/image/extracted_images.rb | 20 +++++++++++++++ lib/mindee/image/image_extractor.rb | 9 ++++--- lib/mindee/v2/file_operations.rb | 1 - lib/mindee/v2/file_operations/crop.rb | 13 +++++----- lib/mindee/v2/file_operations/crop_files.rb | 25 ------------------- lib/mindee/v2/product/crop/crop_response.rb | 2 +- sig/mindee/image/extracted_images.rbs | 8 ++++++ sig/mindee/image/image_extractor.rbs | 4 +-- sig/mindee/v2/file_operation/crop.rbs | 2 +- sig/mindee/v2/file_operation/crop_files.rbs | 9 ------- sig/mindee/v2/product/crop/crop_response.rbs | 2 +- .../crop_operation_integration.rb | 8 +++--- .../v2/file_operations/crop_operation_spec.rb | 2 ++ 15 files changed, 55 insertions(+), 57 deletions(-) create mode 100644 lib/mindee/image/extracted_images.rb delete mode 100644 lib/mindee/v2/file_operations/crop_files.rb create mode 100644 sig/mindee/image/extracted_images.rbs delete mode 100644 sig/mindee/v2/file_operation/crop_files.rbs diff --git a/lib/mindee/image.rb b/lib/mindee/image.rb index 5664bc8f..34d90ec9 100644 --- a/lib/mindee/image.rb +++ b/lib/mindee/image.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require_relative 'image/extracted_image' +require_relative 'image/extracted_images' require_relative 'image/image_compressor' require_relative 'image/image_extractor' require_relative 'image/image_utils' diff --git a/lib/mindee/image/extracted_image.rb b/lib/mindee/image/extracted_image.rb index afd3f586..ba8008fe 100644 --- a/lib/mindee/image/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -74,15 +74,15 @@ def write_to_file(output_path, file_format = nil) # # @return [FileInputSource] A BufferInput source. def as_source - @buffer.rewind - Mindee::Input::Source::BytesInputSource.new(@buffer.read || '', @filename) + as_input_source end # Return the file as a Mindee-compatible BufferInput source. # # @return [FileInputSource] A BufferInput source. def as_input_source - as_source + @buffer.rewind + Mindee::Input::Source::BytesInputSource.new(@buffer.read || '', @filename) end end end diff --git a/lib/mindee/image/extracted_images.rb b/lib/mindee/image/extracted_images.rb new file mode 100644 index 00000000..29384aac --- /dev/null +++ b/lib/mindee/image/extracted_images.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require_relative 'extracted_image' + +module Mindee + # Image Extraction Module. + module Image + # List of extracted images. + class ExtractedImages < Array + # Save all extracted images to disk. + # + # @param output_path [String, Pathname] Directory path to save the extracted images to. + def save_all_to_disk(output_path) + each do |image| + image.write_to_file(File.join(output_path.to_s, image.filename)) + end + end + end + end +end diff --git a/lib/mindee/image/image_extractor.rb b/lib/mindee/image/image_extractor.rb index fa2b0190..d2105f27 100644 --- a/lib/mindee/image/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -7,6 +7,7 @@ require 'tempfile' require_relative '../input/sources' require_relative 'extracted_image' +require_relative 'extracted_images' module Mindee # Image Extraction Module. @@ -32,12 +33,12 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg') # @param [Input::Source::LocalInputSource] input_source # @param [Integer] page_id ID of the Page to extract from. # @param [Array>, Array] polygons List of coordinates to extract. - # @return [Array] Extracted Images. + # @return [Image::ExtractedImages] Extracted Images. def self.extract_multiple_images_from_source(input_source, page_id, polygons) new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id) new_stream.seek(0) - extract_images_from_polygons(input_source, page_id, polygons) + ExtractedImages.new(extract_images_from_polygons(input_source, page_id, polygons)) end # Extracts images from their positions on a file (as polygons). @@ -45,9 +46,9 @@ def self.extract_multiple_images_from_source(input_source, page_id, polygons) # @param [Input::Source::LocalInputSource] input_source Local input source. # @param [Integer] page_id Page ID. # @param [Array] polygons - # @return [Array] Extracted Images. + # @return [Image::ExtractedImages] Extracted Images. def self.extract_images_from_polygons(input_source, page_id, polygons) - extracted_elements = [] # @type var extracted_elements: Array[Image::ExtractedImage] + extracted_elements = ExtractedImages.new # @type var extracted_elements: Image::ExtractedImages input_source.io_stream.rewind pdf_stream = StringIO.new(input_source.io_stream.read.to_s) diff --git a/lib/mindee/v2/file_operations.rb b/lib/mindee/v2/file_operations.rb index fe5a1549..592dffb4 100644 --- a/lib/mindee/v2/file_operations.rb +++ b/lib/mindee/v2/file_operations.rb @@ -1,6 +1,5 @@ # frozen_string_literal: true require_relative 'file_operations/crop' -require_relative 'file_operations/crop_files' require_relative 'file_operations/split' require_relative 'file_operations/split_files' diff --git a/lib/mindee/v2/file_operations/crop.rb b/lib/mindee/v2/file_operations/crop.rb index ac6ab9cf..21529861 100644 --- a/lib/mindee/v2/file_operations/crop.rb +++ b/lib/mindee/v2/file_operations/crop.rb @@ -21,7 +21,7 @@ def self.extract_single_crop(input_source, crop) # # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from. # @param crops [Array] List of crops. - # @return [CropFiles] Individual extracted zones as an array of ExtractedImage. + # @return [Image::ExtractedImages] Individual extracted zones as an array of ExtractedImage. # @raise [MindeeError] if the crops array is empty. def self.extract_crops(input_source, crops) if crops.nil? || crops.empty? @@ -35,15 +35,16 @@ def self.extract_crops(input_source, crops) polygons[crop.location.page] << crop.location.polygon end - images = [] # @type var images: Array[Image::ExtractedImage] + images = Mindee::Image::ExtractedImages.new polygons.each_with_index do |page_polygons, page_index| - extracted = Mindee::Image::ImageExtractor.extract_multiple_images_from_source( - input_source, page_index, page_polygons + images.concat( + Mindee::Image::ImageExtractor.extract_multiple_images_from_source( + input_source, page_index, page_polygons + ) ) - images.concat(extracted) end - CropFiles.new(images) + images end end end diff --git a/lib/mindee/v2/file_operations/crop_files.rb b/lib/mindee/v2/file_operations/crop_files.rb deleted file mode 100644 index c57db246..00000000 --- a/lib/mindee/v2/file_operations/crop_files.rb +++ /dev/null @@ -1,25 +0,0 @@ -# frozen_string_literal: true - -module Mindee - module V2 - module FileOperation - # Collection of cropped files. - class CropFiles < Array - # Save all extracted crops to disk. - # - # @param path [String, Pathname] Path to save the extracted crops to. - # @param prefix [String] Prefix to add to the filename, defaults to 'crop'. - # @param file_format [String, nil] File format to save the crops as, defaults to jpg if nil.] - def save_all_to_disk(path, prefix: 'crop', file_format: nil) - FileUtils.mkdir_p(path) - each.with_index(1) do |crop, idx| - filename = "#{prefix}_#{format('%03d', idx)}.jpg" - file_path = File.join(path.to_s, filename) - - crop.write_to_file(file_path, file_format) - end - end - end - end - end -end diff --git a/lib/mindee/v2/product/crop/crop_response.rb b/lib/mindee/v2/product/crop/crop_response.rb index c2ec8f4f..c862fefb 100644 --- a/lib/mindee/v2/product/crop/crop_response.rb +++ b/lib/mindee/v2/product/crop/crop_response.rb @@ -29,7 +29,7 @@ def to_s # Apply the crop inference to a file and return a list of extracted images. # # @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from - # @return [FileOperation::CropFiles] List of extracted PDFs + # @return [Image::ExtractedImages] List of extracted images def extract_from_file(input_source) FileOperation::Crop.extract_crops(input_source, @inference.result.crops) end diff --git a/sig/mindee/image/extracted_images.rbs b/sig/mindee/image/extracted_images.rbs new file mode 100644 index 00000000..e379d856 --- /dev/null +++ b/sig/mindee/image/extracted_images.rbs @@ -0,0 +1,8 @@ +# lib/mindee/image/extracted_images.rb +module Mindee + module Image + class ExtractedImages < Array[ExtractedImage] + def save_all_to_disk: (String | Pathname) -> void + end + end +end diff --git a/sig/mindee/image/image_extractor.rbs b/sig/mindee/image/image_extractor.rbs index c873379f..8a88f9d6 100644 --- a/sig/mindee/image/image_extractor.rbs +++ b/sig/mindee/image/image_extractor.rbs @@ -4,8 +4,8 @@ module Mindee module ImageExtractor def self.attach_image_as_new_file: (StringIO | File, ?format: String) -> Origami::PDF def self.to_blob: () -> String - def self.extract_multiple_images_from_source: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] |Geometry::Polygon | Geometry::Quadrilateral]) -> Array[ExtractedImage] - def self.extract_images_from_polygons: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] | Geometry::Polygon | Geometry::Quadrilateral]) -> Array[ExtractedImage] + def self.extract_multiple_images_from_source: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] |Geometry::Polygon | Geometry::Quadrilateral]) -> ExtractedImages + def self.extract_images_from_polygons: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] | Geometry::Polygon | Geometry::Quadrilateral]) -> ExtractedImages def self.create_extracted_image: (StringIO | File, String, Integer, Integer) -> ExtractedImage def self.load_input_source_pdf_page_as_stringio: (Input::Source::LocalInputSource, Integer) -> (StringIO | File) end diff --git a/sig/mindee/v2/file_operation/crop.rbs b/sig/mindee/v2/file_operation/crop.rbs index f10349df..901b33a9 100644 --- a/sig/mindee/v2/file_operation/crop.rbs +++ b/sig/mindee/v2/file_operation/crop.rbs @@ -3,7 +3,7 @@ module Mindee module FileOperation module Crop def self.extract_single_crop: (Input::Source::LocalInputSource, Parsing::Field::FieldLocation) -> Image::ExtractedImage - def self.extract_crops: (Input::Source::LocalInputSource, Array[Product::Crop::CropItem]) -> CropFiles + def self.extract_crops: (Input::Source::LocalInputSource, Array[Product::Crop::CropItem]) -> Image::ExtractedImages end end end diff --git a/sig/mindee/v2/file_operation/crop_files.rbs b/sig/mindee/v2/file_operation/crop_files.rbs deleted file mode 100644 index 7994d94c..00000000 --- a/sig/mindee/v2/file_operation/crop_files.rbs +++ /dev/null @@ -1,9 +0,0 @@ -module Mindee - module V2 - module FileOperation - class CropFiles < Array[Image::ExtractedImage] - def save_all_to_disk: (String | Pathname, ?prefix: String, ?file_format: String?) -> void - end - end - end -end diff --git a/sig/mindee/v2/product/crop/crop_response.rbs b/sig/mindee/v2/product/crop/crop_response.rbs index 7ebadb1a..db6d970c 100644 --- a/sig/mindee/v2/product/crop/crop_response.rbs +++ b/sig/mindee/v2/product/crop/crop_response.rbs @@ -13,7 +13,7 @@ module Mindee def _params_type: -> singleton(Params::CropParameters) - def extract_from_file: (Input::Source::LocalInputSource) -> FileOperation::CropFiles + def extract_from_file: (Input::Source::LocalInputSource) -> Image::ExtractedImages def to_s: -> String def self._params_type: () -> singleton(Params::CropParameters) diff --git a/spec/v2/file_operations/crop_operation_integration.rb b/spec/v2/file_operations/crop_operation_integration.rb index ab3b9ca9..63ed1859 100644 --- a/spec/v2/file_operations/crop_operation_integration.rb +++ b/spec/v2/file_operations/crop_operation_integration.rb @@ -22,8 +22,8 @@ end after(:all) do - FileUtils.rm_f("#{OUTPUT_DIR}/crop_001.jpg") - FileUtils.rm_f("#{OUTPUT_DIR}/crop_002.jpg") + FileUtils.rm_f("#{OUTPUT_DIR}/default_sample.jpg_page0-0.jpg") + FileUtils.rm_f("#{OUTPUT_DIR}/default_sample.jpg_page0-1.jpg") end # Validates the parsed financial document response properties. @@ -65,7 +65,7 @@ def check_findoc_return(findoc_response) extracted_images.save_all_to_disk(OUTPUT_DIR) - expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to be_between(560_000, 700_000) - expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to be_between(580_000, 700_000) + expect(File.size(File.join(OUTPUT_DIR, 'default_sample.jpg_page0-0.jpg'))).to be_between(560_000, 700_000) + expect(File.size(File.join(OUTPUT_DIR, 'default_sample.jpg_page0-1.jpg'))).to be_between(580_000, 700_000) end end diff --git a/spec/v2/file_operations/crop_operation_spec.rb b/spec/v2/file_operations/crop_operation_spec.rb index f44f8a2a..ced2953e 100644 --- a/spec/v2/file_operations/crop_operation_spec.rb +++ b/spec/v2/file_operations/crop_operation_spec.rb @@ -30,6 +30,7 @@ extracted_crops = described_class.extract_crops(input_sample, doc.inference.result.crops) + expect(extracted_crops).to be_a(Mindee::Image::ExtractedImages) expect(extracted_crops.size).to eq(1) expect(extracted_crops[0].page_id).to eq(0) @@ -46,6 +47,7 @@ extracted_crops = described_class.extract_crops(input_sample, doc.inference.result.crops) + expect(extracted_crops).to be_a(Mindee::Image::ExtractedImages) expect(extracted_crops.size).to eq(2) expect(extracted_crops[0].page_id).to eq(0) From 9c1d7d7d9461f46d4b3b7903beb9dd98d68842b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Fri, 26 Jun 2026 23:34:10 +0200 Subject: [PATCH 2/2] remove split_files.rb --- lib/mindee/image/extracted_image.rb | 7 ------ lib/mindee/image/image_extractor.rb | 25 ++++++++++++++++--- lib/mindee/image/image_utils.rb | 17 ------------- lib/mindee/pdf.rb | 1 + lib/mindee/pdf/extracted_pdf.rb | 2 +- lib/mindee/pdf/extracted_pdfs.rb | 20 +++++++++++++++ lib/mindee/pdf/pdf_extractor.rb | 20 +++++++-------- lib/mindee/v2/file_operations.rb | 1 - lib/mindee/v2/file_operations/split.rb | 4 +-- lib/mindee/v2/file_operations/split_files.rb | 25 ------------------- lib/mindee/v2/product/split/split_response.rb | 2 +- sig/mindee/image/extracted_image.rbs | 1 - sig/mindee/image/image_extractor.rbs | 1 + sig/mindee/image/image_utils.rbs | 1 - sig/mindee/pdf/extracted_pdfs.rbs | 8 ++++++ sig/mindee/pdf/pdf_extractor.rbs | 4 +-- sig/mindee/v2/file_operation/split.rbs | 2 +- sig/mindee/v2/file_operation/split_files.rbs | 9 ------- .../v2/product/split/split_response.rbs | 2 +- spec/image/extracted_image_spec.rb | 4 +-- spec/image/image_extractor_spec.rb | 6 ++--- .../multi_receipts_extractor_spec.rb | 22 ++++++++-------- .../split_operation_integration.rb | 8 +++--- .../file_operations/split_operation_spec.rb | 2 ++ 24 files changed, 91 insertions(+), 103 deletions(-) create mode 100644 lib/mindee/pdf/extracted_pdfs.rb delete mode 100644 lib/mindee/v2/file_operations/split_files.rb create mode 100644 sig/mindee/pdf/extracted_pdfs.rbs delete mode 100644 sig/mindee/v2/file_operation/split_files.rbs diff --git a/lib/mindee/image/extracted_image.rb b/lib/mindee/image/extracted_image.rb index ba8008fe..24a11785 100644 --- a/lib/mindee/image/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -70,13 +70,6 @@ def write_to_file(output_path, file_format = nil) end end - # Return the file as a Mindee-compatible BufferInput source. - # - # @return [FileInputSource] A BufferInput source. - def as_source - as_input_source - end - # Return the file as a Mindee-compatible BufferInput source. # # @return [FileInputSource] A BufferInput source. diff --git a/lib/mindee/image/image_extractor.rb b/lib/mindee/image/image_extractor.rb index d2105f27..de37204d 100644 --- a/lib/mindee/image/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -14,10 +14,10 @@ module Mindee module Image # Image Extraction wrapper class. module ImageExtractor - # Attaches an image as a new page in a PdfDocument object. + # Attaches an image as a new page in a PDFDocument object. # # @param [StringIO] input_buffer Input buffer. Only supports JPEG. - # @return [Origami::PDF] A PdfDocument handle. + # @return [Origami::PDF] A PDFDocument handle. def self.attach_image_as_new_file(input_buffer, format: 'jpg') magick_image = MiniMagick::Image.read(input_buffer) # NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output. @@ -66,7 +66,7 @@ def self.extract_images_from_polygons(input_source, page_id, polygons) min_max_x = Geometry.get_min_max_x(points) min_max_y = Geometry.get_min_max_y(points) file_extension = ImageUtils.determine_file_extension(input_source) - cropped_image = ImageUtils.crop_image(page_content, min_max_x, min_max_y) + cropped_image = crop_image(page_content, min_max_x, min_max_y) if file_extension == 'pdf' cropped_image.format('jpg') else @@ -103,7 +103,7 @@ def self.create_extracted_image(buffer, file_name, page_id, element_id) # # @param input_file [LocalInputSource] Local input. # @param [Integer] page_id Page ID. - # @return [StringIO] A valid PdfDocument handle. + # @return [StringIO] A valid PDFDocument handle. def self.load_input_source_pdf_page_as_stringio(input_file, page_id) input_file.io_stream.rewind if input_file.pdf? @@ -112,6 +112,23 @@ def self.load_input_source_pdf_page_as_stringio(input_file, page_id) input_file.io_stream end end + + # Crops a MiniMagick Image from the given bounding box. + # + # @param [MiniMagick::Image] image Input Image. + # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates. + # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates. + def self.crop_image(image, min_max_x, min_max_y) + width = image[:width].to_i + height = image[:height].to_i + + image.format('jpg') + new_width = (min_max_x.max - min_max_x.min) * width + new_height = (min_max_y.max - min_max_y.min) * height + image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}") + + image + end end end end diff --git a/lib/mindee/image/image_utils.rb b/lib/mindee/image/image_utils.rb index a9f3bd69..9ceaf277 100644 --- a/lib/mindee/image/image_utils.rb +++ b/lib/mindee/image/image_utils.rb @@ -124,23 +124,6 @@ def self.read_page_content(pdf_stream) MiniMagick::Image.read(pdf_stream) end - # Crops a MiniMagick Image from a the given bounding box. - # - # @param [MiniMagick::Image] image Input Image. - # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates. - # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates. - def self.crop_image(image, min_max_x, min_max_y) - width = image[:width].to_i - height = image[:height].to_i - - image.format('jpg') - new_width = (min_max_x.max - min_max_x.min) * width - new_height = (min_max_y.max - min_max_y.min) * height - image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}") - - image - end - # Writes a MiniMagick::Image to a buffer. # # @param [MiniMagick::Image] image a valid MiniMagick image. diff --git a/lib/mindee/pdf.rb b/lib/mindee/pdf.rb index 48b05d99..8cbdd065 100644 --- a/lib/mindee/pdf.rb +++ b/lib/mindee/pdf.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require_relative 'pdf/extracted_pdf' +require_relative 'pdf/extracted_pdfs' require_relative 'pdf/pdf_compressor' require_relative 'pdf/pdf_extractor' require_relative 'pdf/pdf_processor' diff --git a/lib/mindee/pdf/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb index 7f2efb6f..6a4c7cda 100644 --- a/lib/mindee/pdf/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -3,7 +3,7 @@ module Mindee # PDF Extraction Module. module PDF - # An extracted sub-Pdf. + # An extracted sub-PDF. class ExtractedPDF # Byte contents of the pdf # @return [StringIO] diff --git a/lib/mindee/pdf/extracted_pdfs.rb b/lib/mindee/pdf/extracted_pdfs.rb new file mode 100644 index 00000000..ec5c0f1c --- /dev/null +++ b/lib/mindee/pdf/extracted_pdfs.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require_relative 'extracted_pdf' + +module Mindee + # PDF Extraction Module. + module PDF + # List of extracted PDFs. + class ExtractedPDFs < Array + # Save all extracted PDFs to disk. + # + # @param output_path [String, Pathname] Directory path to save the extracted PDFs to. + def save_all_to_disk(output_path) + each do |pdf| + pdf.write_to_file(File.join(output_path.to_s, pdf.filename)) + end + end + end + end +end diff --git a/lib/mindee/pdf/pdf_extractor.rb b/lib/mindee/pdf/pdf_extractor.rb index a1150312..ae03a3cc 100644 --- a/lib/mindee/pdf/pdf_extractor.rb +++ b/lib/mindee/pdf/pdf_extractor.rb @@ -1,9 +1,9 @@ # frozen_string_literal: true module Mindee - # Pdf Extraction Module. + # PDF Extraction Module. module PDF - # Pdf extraction class. + # PDF extraction class. class PDFExtractor # @param local_input [Mindee::Input::Source::LocalInputSource] def initialize(local_input) @@ -23,15 +23,15 @@ def initialize(local_input) end end - # Retrieves the page count for the Pdf object. + # Retrieves the page count for the PDF object. # @return [Integer] def page_count Mindee::PDF::PDFProcessor.open_pdf(@source_pdf).pages.size end - # Creates a new Pdf from pages and save it into a buffer. - # @param page_indexes [Array] List of page number to use for merging in the original Pdf. - # @return [StringIO] The buffer containing the new Pdf. + # Creates a new PDF from pages and save it into a buffer. + # @param page_indexes [Array] List of page number to use for merging in the original PDF. + # @return [StringIO] The buffer containing the new PDF. def cut_pages(page_indexes) options = PageOptions.new(params: { page_indexes: page_indexes, @@ -41,10 +41,10 @@ def cut_pages(page_indexes) end # Extract the sub-documents from the main pdf, based on the given list of page indexes. - # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. - # @return [Array] The buffer containing the new Pdf. + # @param page_indexes [Array>] List of page number to use for merging in the original PDF. + # @return [Mindee::PDF::ExtractedPDFs] The buffer containing the new PDF. def extract_sub_documents(page_indexes) - extracted_pdfs = [] # @type var extracted_pdfs: Array[Mindee::PDF::ExtractedPDF] + extracted_pdfs = ExtractedPDFs.new # @type var extracted_pdfs: Mindee::PDF::ExtractedPDFs extension = File.extname(@filename) basename = File.basename(@filename, extension) page_indexes.each do |page_index_list| @@ -74,7 +74,7 @@ def extract_sub_documents(page_indexes) # Extracts invoices as complete PDFs from the document. # @param page_indexes [Array, InvoiceSplitterV1InvoicePageGroup>] # @param strict [bool] - # @return [Array] + # @return [Mindee::PDF::ExtractedPDFs] def extract_invoices(page_indexes, strict: false) raise Error::MindeePDFError, 'No indexes provided.' if page_indexes.empty? diff --git a/lib/mindee/v2/file_operations.rb b/lib/mindee/v2/file_operations.rb index 592dffb4..91f87057 100644 --- a/lib/mindee/v2/file_operations.rb +++ b/lib/mindee/v2/file_operations.rb @@ -2,4 +2,3 @@ require_relative 'file_operations/crop' require_relative 'file_operations/split' -require_relative 'file_operations/split_files' diff --git a/lib/mindee/v2/file_operations/split.rb b/lib/mindee/v2/file_operations/split.rb index 60dea7fc..b2d9d1a1 100644 --- a/lib/mindee/v2/file_operations/split.rb +++ b/lib/mindee/v2/file_operations/split.rb @@ -18,7 +18,7 @@ def self.extract_single_split(input_source, split) # # @param input_source [LocalInputSource] Input source to split. # @param splits [Array>] List of sub-lists of pages to keep. - # @return [SplitFiles] A list of extracted invoices. + # @return [PDF::ExtractedPDFs] A list of extracted invoices. # @raise [MindeeError] if no indexes are provided. def self.extract_splits(input_source, splits) raise Mindee::Error::MindeeError, 'No indexes provided.' if splits.nil? || splits.empty? @@ -29,7 +29,7 @@ def self.extract_splits(input_source, splits) (split[0]..split[1]).to_a end - SplitFiles.new(pdf_extractor.extract_sub_documents(page_groups)) + pdf_extractor.extract_sub_documents(page_groups) end end end diff --git a/lib/mindee/v2/file_operations/split_files.rb b/lib/mindee/v2/file_operations/split_files.rb deleted file mode 100644 index d75b3f46..00000000 --- a/lib/mindee/v2/file_operations/split_files.rb +++ /dev/null @@ -1,25 +0,0 @@ -# frozen_string_literal: true - -module Mindee - module V2 - module FileOperation - # Collection of split files. - class SplitFiles < Array - # Save all extracted splits to disk. - # - # @param path [String, Pathname] Path to save the extracted splits to. - # @param prefix [String] Prefix to add to the filename, defaults to 'split'. - def save_all_to_disk(path, prefix: 'split') - FileUtils.mkdir_p(path) - - each.with_index(1) do |split, idx| - filename = "#{prefix}_#{format('%03d', idx)}.pdf" - file_path = File.join(path.to_s, filename) - - split.write_to_file(file_path) - end - end - end - end - end -end diff --git a/lib/mindee/v2/product/split/split_response.rb b/lib/mindee/v2/product/split/split_response.rb index b269fd7d..15436864 100644 --- a/lib/mindee/v2/product/split/split_response.rb +++ b/lib/mindee/v2/product/split/split_response.rb @@ -28,7 +28,7 @@ def to_s # Splits the input PDF. # @param input_source [Mindee::Input::Source::LocalInputSource] Path to the file or a File object. - # @return [FileOperation::SplitFiles] + # @return [PDF::ExtractedPDFs] def extract_from_file(input_source) splits = @inference.result.splits.map(&:page_range) FileOperation::Split.extract_splits(input_source, splits) diff --git a/sig/mindee/image/extracted_image.rbs b/sig/mindee/image/extracted_image.rbs index f60abe12..72c6b314 100644 --- a/sig/mindee/image/extracted_image.rbs +++ b/sig/mindee/image/extracted_image.rbs @@ -14,7 +14,6 @@ module Mindee ?preserve_input_filename: bool ) -> Integer def write_to_file: (String, ?String?) -> void - def as_source: -> Input::Source::BytesInputSource def as_input_source: -> Input::Source::BytesInputSource end end diff --git a/sig/mindee/image/image_extractor.rbs b/sig/mindee/image/image_extractor.rbs index 8a88f9d6..70a2f6d2 100644 --- a/sig/mindee/image/image_extractor.rbs +++ b/sig/mindee/image/image_extractor.rbs @@ -7,6 +7,7 @@ module Mindee def self.extract_multiple_images_from_source: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] |Geometry::Polygon | Geometry::Quadrilateral]) -> ExtractedImages def self.extract_images_from_polygons: (Input::Source::LocalInputSource, Integer, Array[Array[Geometry::Point] | Geometry::Polygon | Geometry::Quadrilateral]) -> ExtractedImages def self.create_extracted_image: (StringIO | File, String, Integer, Integer) -> ExtractedImage + def self.crop_image: (MiniMagick::Image, Geometry::MinMax, Geometry::MinMax) -> (MiniMagick::Image) def self.load_input_source_pdf_page_as_stringio: (Input::Source::LocalInputSource, Integer) -> (StringIO | File) end end diff --git a/sig/mindee/image/image_utils.rbs b/sig/mindee/image/image_utils.rbs index d4d853d5..f31d86fb 100644 --- a/sig/mindee/image/image_utils.rbs +++ b/sig/mindee/image/image_utils.rbs @@ -11,7 +11,6 @@ module Mindee def self.pdf_to_magick_image: (StringIO | File, Integer) -> MiniMagick::Image def self.normalize_polygon: (Geometry::Quadrilateral | Geometry::Polygon | Array[Geometry::Point]) -> Geometry::Quadrilateral def self.read_page_content: (StringIO | File) -> (MiniMagick::Image) - def self.crop_image: (MiniMagick::Image, Geometry::MinMax, Geometry::MinMax) -> (MiniMagick::Image) def self.write_image_to_buffer: (MiniMagick::Image, StringIO) -> void def self.determine_file_extension: (Input::Source::LocalInputSource) -> String? end diff --git a/sig/mindee/pdf/extracted_pdfs.rbs b/sig/mindee/pdf/extracted_pdfs.rbs new file mode 100644 index 00000000..92d4921e --- /dev/null +++ b/sig/mindee/pdf/extracted_pdfs.rbs @@ -0,0 +1,8 @@ +# lib/mindee/pdf/extracted_pdfs.rb +module Mindee + module PDF + class ExtractedPDFs < Array[ExtractedPDF] + def save_all_to_disk: (String | Pathname) -> void + end + end +end diff --git a/sig/mindee/pdf/pdf_extractor.rbs b/sig/mindee/pdf/pdf_extractor.rbs index 553d5396..c698c6f1 100644 --- a/sig/mindee/pdf/pdf_extractor.rbs +++ b/sig/mindee/pdf/pdf_extractor.rbs @@ -11,9 +11,9 @@ module Mindee def cut_pages: (Array[Integer]) -> StringIO - def extract_sub_documents: (Array[Array[Integer]]) -> Array[ExtractedPDF] + def extract_sub_documents: (Array[Array[Integer]]) -> ExtractedPDFs - def extract_invoices: (Array[Mindee::V1::Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup] | Array[Array[Integer]], ?strict: bool) -> Array[ExtractedPDF] + def extract_invoices: (Array[Mindee::V1::Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup] | Array[Array[Integer]], ?strict: bool) -> ExtractedPDFs end end end diff --git a/sig/mindee/v2/file_operation/split.rbs b/sig/mindee/v2/file_operation/split.rbs index 5424c987..a7bbc764 100644 --- a/sig/mindee/v2/file_operation/split.rbs +++ b/sig/mindee/v2/file_operation/split.rbs @@ -4,7 +4,7 @@ module Mindee module Split def self.extract_single_split: (Input::Source::LocalInputSource, Array[Integer]) -> PDF::ExtractedPDF - def self.extract_splits: (Input::Source::LocalInputSource, Array[Array[Integer]]) -> SplitFiles + def self.extract_splits: (Input::Source::LocalInputSource, Array[Array[Integer]]) -> PDF::ExtractedPDFs end end end diff --git a/sig/mindee/v2/file_operation/split_files.rbs b/sig/mindee/v2/file_operation/split_files.rbs deleted file mode 100644 index 38a86457..00000000 --- a/sig/mindee/v2/file_operation/split_files.rbs +++ /dev/null @@ -1,9 +0,0 @@ -module Mindee - module V2 - module FileOperation - class SplitFiles < Array[PDF::ExtractedPDF] - def save_all_to_disk: (String | Pathname, ?prefix: String?) -> void - end - end - end -end diff --git a/sig/mindee/v2/product/split/split_response.rbs b/sig/mindee/v2/product/split/split_response.rbs index bd04c969..7b702f3e 100644 --- a/sig/mindee/v2/product/split/split_response.rbs +++ b/sig/mindee/v2/product/split/split_response.rbs @@ -13,7 +13,7 @@ module Mindee def _params_type: -> singleton(Params::SplitParameters) - def extract_from_file: (Mindee::Input::Source::LocalInputSource) -> FileOperation::SplitFiles + def extract_from_file: (Mindee::Input::Source::LocalInputSource) -> PDF::ExtractedPDFs def to_s: -> String def self._params_type: () -> singleton(Params::SplitParameters) diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb index 3662ed82..4c7c82cf 100644 --- a/spec/image/extracted_image_spec.rb +++ b/spec/image/extracted_image_spec.rb @@ -87,11 +87,11 @@ end end - describe '#as_source' do + describe '#as_input_source' do it 'returns a BytesInputSource with the correct content and filename' do extracted_image = described_class.new(input_source, page_id, element_id) - source = extracted_image.as_source + source = extracted_image.as_input_source expect(source).to be_a(Mindee::Input::Source::BytesInputSource) expect(source.filename).to eq('default_sample_p1_42.jpg') diff --git a/spec/image/image_extractor_spec.rb b/spec/image/image_extractor_spec.rb index a22030a6..93e44aa9 100644 --- a/spec/image/image_extractor_spec.rb +++ b/spec/image/image_extractor_spec.rb @@ -31,7 +31,7 @@ expect(extracted_barcodes_1d.size).to eq(1) expect(extracted_barcodes_2d.size).to eq(2) - expect(extracted_barcodes_1d[0].as_source.filename).to end_with('jpg') + expect(extracted_barcodes_1d[0].as_input_source.filename).to end_with('jpg') extracted_barcodes_1d[0].buffer.rewind image_buffer1 = MiniMagick::Image.read(extracted_barcodes_1d[0].buffer) expect(image_buffer1.dimensions).to eq([353, 199]) @@ -39,12 +39,12 @@ extracted_barcodes_2d[0].buffer.rewind image_buffer2 = MiniMagick::Image.read(extracted_barcodes_2d[0].buffer) expect(image_buffer2.dimensions).to eq([214, 216]) - expect(extracted_barcodes_2d[0].as_source.filename).to end_with('jpg') + expect(extracted_barcodes_2d[0].as_input_source.filename).to end_with('jpg') extracted_barcodes_2d[0].buffer.rewind image_buffer3 = MiniMagick::Image.read(extracted_barcodes_2d[1].buffer) expect(image_buffer3.dimensions).to eq([193, 201]) - expect(extracted_barcodes_2d[1].as_source.filename).to end_with('jpg') + expect(extracted_barcodes_2d[1].as_input_source.filename).to end_with('jpg') end end end diff --git a/spec/v1/extraction/multi_receipts_extractor_spec.rb b/spec/v1/extraction/multi_receipts_extractor_spec.rb index 0c5337fc..d86a31b4 100644 --- a/spec/v1/extraction/multi_receipts_extractor_spec.rb +++ b/spec/v1/extraction/multi_receipts_extractor_spec.rb @@ -47,37 +47,37 @@ image_buffer0 = MiniMagick::Image.read(extracted_receipts[0].buffer) # NOTE: this varies from other SDKs due to different image processing. expect(image_buffer0.dimensions).to eq([341, 504]) - expect(extracted_receipts[0].as_source.filename).to end_with('jpg') + expect(extracted_receipts[0].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[1].page_id).to eq(1) expect(extracted_receipts[1].element_id).to eq(1) image_buffer1 = MiniMagick::Image.read(extracted_receipts[1].buffer) expect(image_buffer1.dimensions).to eq([461, 908]) - expect(extracted_receipts[1].as_source.filename).to end_with('jpg') + expect(extracted_receipts[1].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[2].page_id).to eq(1) expect(extracted_receipts[2].element_id).to eq(2) image_buffer2 = MiniMagick::Image.read(extracted_receipts[2].buffer) expect(image_buffer2.dimensions).to eq([472, 790]) - expect(extracted_receipts[2].as_source.filename).to end_with('jpg') + expect(extracted_receipts[2].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[3].page_id).to eq(1) expect(extracted_receipts[3].element_id).to eq(3) image_buffer3 = MiniMagick::Image.read(extracted_receipts[3].buffer) expect(image_buffer3.dimensions).to eq([464, 1200]) - expect(extracted_receipts[3].as_source.filename).to end_with('jpg') + expect(extracted_receipts[3].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[4].page_id).to eq(1) expect(extracted_receipts[4].element_id).to eq(4) image_buffer4 = MiniMagick::Image.read(extracted_receipts[4].buffer) expect(image_buffer4.dimensions).to eq([530, 944]) - expect(extracted_receipts[4].as_source.filename).to end_with('jpg') + expect(extracted_receipts[4].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[5].page_id).to eq(1) expect(extracted_receipts[5].element_id).to eq(5) image_buffer5 = MiniMagick::Image.read(extracted_receipts[5].buffer) expect(image_buffer5.dimensions).to eq([366, 593]) - expect(extracted_receipts[5].as_source.filename).to end_with('jpg') + expect(extracted_receipts[5].as_input_source.filename).to end_with('jpg') end end @@ -94,31 +94,31 @@ expect(extracted_receipts[0].element_id).to eq(0) image_buffer0 = MiniMagick::Image.read(extracted_receipts[0].buffer) expect(image_buffer0.dimensions).to eq([198, 566]) - expect(extracted_receipts[0].as_source.filename).to end_with('jpg') + expect(extracted_receipts[0].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[1].page_id).to eq(1) expect(extracted_receipts[1].element_id).to eq(1) image_buffer1 = MiniMagick::Image.read(extracted_receipts[1].buffer) expect(image_buffer1.dimensions).to eq([205, 382]) - expect(extracted_receipts[1].as_source.filename).to end_with('jpg') + expect(extracted_receipts[1].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[2].page_id).to eq(1) expect(extracted_receipts[2].element_id).to eq(2) image_buffer2 = MiniMagick::Image.read(extracted_receipts[2].buffer) expect(image_buffer2.dimensions).to eq([195, 232]) - expect(extracted_receipts[2].as_source.filename).to end_with('jpg') + expect(extracted_receipts[2].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[3].page_id).to eq(2) expect(extracted_receipts[3].element_id).to eq(0) image_buffer3 = MiniMagick::Image.read(extracted_receipts[3].buffer) expect(image_buffer3.dimensions).to eq([213, 355]) - expect(extracted_receipts[3].as_source.filename).to end_with('jpg') + expect(extracted_receipts[3].as_input_source.filename).to end_with('jpg') expect(extracted_receipts[4].page_id).to eq(2) expect(extracted_receipts[4].element_id).to eq(1) image_buffer4 = MiniMagick::Image.read(extracted_receipts[4].buffer) expect(image_buffer4.dimensions).to eq([212, 516]) - expect(extracted_receipts[4].as_source.filename).to end_with('jpg') + expect(extracted_receipts[4].as_input_source.filename).to end_with('jpg') end end diff --git a/spec/v2/file_operations/split_operation_integration.rb b/spec/v2/file_operations/split_operation_integration.rb index 8e216354..2b0163ed 100644 --- a/spec/v2/file_operations/split_operation_integration.rb +++ b/spec/v2/file_operations/split_operation_integration.rb @@ -27,8 +27,8 @@ end after(:all) do - FileUtils.rm_f("#{OUTPUT_DIR}/split_001.pdf") - FileUtils.rm_f("#{OUTPUT_DIR}/split_002.pdf") + FileUtils.rm_f("#{OUTPUT_DIR}/default_sample_001-001.pdf") + FileUtils.rm_f("#{OUTPUT_DIR}/default_sample_002-002.pdf") end # Validates the parsed financial document response properties. @@ -70,8 +70,8 @@ def check_findoc_return(findoc_response) extracted_pdfs.save_all_to_disk(OUTPUT_DIR) - extracted_pdfs.each_with_index do |pdf, i| - local_input = Mindee::Input::Source::PathInputSource.new(File.join(OUTPUT_DIR, format('split_%03d.pdf', i + 1))) + extracted_pdfs.each do |pdf| + local_input = Mindee::Input::Source::PathInputSource.new(File.join(OUTPUT_DIR, pdf.filename)) begin expect(local_input.page_count).to eq(pdf.page_count) ensure diff --git a/spec/v2/file_operations/split_operation_spec.rb b/spec/v2/file_operations/split_operation_spec.rb index c0d87fb4..49b230a7 100644 --- a/spec/v2/file_operations/split_operation_spec.rb +++ b/spec/v2/file_operations/split_operation_spec.rb @@ -28,6 +28,7 @@ extracted_splits = doc.extract_from_file(input_sample) + expect(extracted_splits).to be_a(Mindee::PDF::ExtractedPDFs) expect(extracted_splits.size).to eq(1) expect(extracted_splits[0].page_count).to eq(1) @@ -40,6 +41,7 @@ extracted_splits = doc.extract_from_file(input_sample) + expect(extracted_splits).to be_a(Mindee::PDF::ExtractedPDFs) expect(extracted_splits.size).to eq(3) expect(extracted_splits[0].page_count).to eq(1)