Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/mindee/image.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require_relative 'image/extracted_image'
require_relative 'image/extracted_images'
require_relative 'image/image_compressor'
require_relative 'image/image_extractor'
require_relative 'image/image_utils'
9 changes: 1 addition & 8 deletions lib/mindee/image/extracted_image.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,10 @@ def write_to_file(output_path, file_format = nil)
# Return the file as a Mindee-compatible bytes input source.
#
# @return [Input::Source::BytesInputSource] A BytesInputSource built from the buffer contents.
def as_source
def as_input_source
@buffer.rewind
Mindee::Input::Source::BytesInputSource.new(@buffer.read || '', @filename)
end

# Return the file as a Mindee-compatible BufferInput source.
#
# @return [FileInputSource] A BufferInput source.
def as_input_source
as_source
end
end
end
end
20 changes: 20 additions & 0 deletions lib/mindee/image/extracted_images.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true

require_relative 'extracted_image'

require 'fileutils'

module Mindee
  # Image Extraction Module.
  module Image
    # List of extracted images.
    #
    # Array subclass that adds a helper to write every element to disk in one call.
    class ExtractedImages < Array
      # Save all extracted images to disk.
      #
      # Each element is written under +output_path+ using its own +filename+.
      # The directory (and any missing parent) is created first, so saving to a
      # not-yet-existing folder no longer fails with Errno::ENOENT.
      #
      # @param output_path [String, Pathname] Directory path to save the extracted images to.
      # @return [self] The list itself, so calls can be chained.
      def save_all_to_disk(output_path)
        directory = output_path.to_s
        # An empty path is left untouched so the historical File.join behaviour is preserved.
        FileUtils.mkdir_p(directory) unless directory.empty?

        each do |image|
          image.write_to_file(File.join(directory, image.filename))
        end
      end
    end
  end
end
34 changes: 26 additions & 8 deletions lib/mindee/image/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
require 'tempfile'
require_relative '../input/sources'
require_relative 'extracted_image'
require_relative 'extracted_images'

module Mindee
# Image Extraction Module.
module Image
# Image Extraction wrapper class.
module ImageExtractor
# Attaches an image as a new page in a PdfDocument object.
# Attaches an image as a new page in a PDFDocument object.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
# @return [Origami::PDF] A PdfDocument handle.
# @return [Origami::PDF] A PDFDocument handle.
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
magick_image = MiniMagick::Image.read(input_buffer)
# NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output.
Expand All @@ -32,22 +33,22 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg')
# @param [Input::Source::LocalInputSource] input_source
# @param [Integer] page_id ID of the Page to extract from.
# @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates to extract.
# @return [Array<Image::ExtractedImage>] Extracted Images.
# @return [Image::ExtractedImages] Extracted Images.
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
# NOTE(review): new_stream is loaded and rewound here but is not read again in this method;
# the extraction call below receives input_source, not new_stream. Confirm whether this load
# is still required (e.g. for a side effect of the loader) or can be dropped.
new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id)
new_stream.seek(0)

extract_images_from_polygons(input_source, page_id, polygons)
ExtractedImages.new(extract_images_from_polygons(input_source, page_id, polygons))
end

# Extracts images from their positions on a file (as polygons).
#
# @param [Input::Source::LocalInputSource] input_source Local input source.
# @param [Integer] page_id Page ID.
# @param [Array<Geometry::Point, Geometry::Polygon, Geometry::Quadrilateral>] polygons
# @return [Array<Image::ExtractedImage>] Extracted Images.
# @return [Image::ExtractedImages] Extracted Images.
def self.extract_images_from_polygons(input_source, page_id, polygons)
extracted_elements = [] # @type var extracted_elements: Array[Image::ExtractedImage]
extracted_elements = ExtractedImages.new # @type var extracted_elements: Image::ExtractedImages

input_source.io_stream.rewind
pdf_stream = StringIO.new(input_source.io_stream.read.to_s)
Expand All @@ -65,7 +66,7 @@ def self.extract_images_from_polygons(input_source, page_id, polygons)
min_max_x = Geometry.get_min_max_x(points)
min_max_y = Geometry.get_min_max_y(points)
file_extension = ImageUtils.determine_file_extension(input_source)
cropped_image = ImageUtils.crop_image(page_content, min_max_x, min_max_y)
cropped_image = crop_image(page_content, min_max_x, min_max_y)
if file_extension == 'pdf'
cropped_image.format('jpg')
else
Expand Down Expand Up @@ -102,7 +103,7 @@ def self.create_extracted_image(buffer, file_name, page_id, element_id)
#
# @param input_file [LocalInputSource] Local input.
# @param [Integer] page_id Page ID.
# @return [StringIO] A valid PdfDocument handle.
# @return [StringIO] A valid PDFDocument handle.
def self.load_input_source_pdf_page_as_stringio(input_file, page_id)
input_file.io_stream.rewind
if input_file.pdf?
Expand All @@ -111,6 +112,23 @@ def self.load_input_source_pdf_page_as_stringio(input_file, page_id)
input_file.io_stream
end
end

# Crops a MiniMagick image down to a bounding box.
#
# The min/max values are multiplied by the image's pixel width and height to
# build the crop geometry. The image is converted to 'jpg' before cropping.
#
# @param [MiniMagick::Image] image Input Image.
# @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
# @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
# @return [MiniMagick::Image] The image object that was passed in, after format and crop were applied.
def self.crop_image(image, min_max_x, min_max_y)
  page_width = image[:width].to_i
  page_height = image[:height].to_i
  image.format('jpg')

  crop_width = (min_max_x.max - min_max_x.min) * page_width
  crop_height = (min_max_y.max - min_max_y.min) * page_height
  offset_x = min_max_x.min * page_width
  offset_y = min_max_y.min * page_height

  # Geometry layout: <width>x<height>+<x offset>+<y offset>
  geometry = "#{crop_width}x#{crop_height}+#{offset_x}+#{offset_y}"
  image.crop(geometry)
  image
end
end
end
end
17 changes: 0 additions & 17 deletions lib/mindee/image/image_utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,23 +124,6 @@ def self.read_page_content(pdf_stream)
MiniMagick::Image.read(pdf_stream)
end

# Crops a MiniMagick image down to the given bounding box.
#
# The min/max values are multiplied by the image's pixel width and height to
# build the crop geometry. The image is converted to 'jpg' before cropping.
#
# @param [MiniMagick::Image] image Input Image.
# @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
# @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
# @return [MiniMagick::Image] The image object that was passed in, after format and crop were applied.
def self.crop_image(image, min_max_x, min_max_y)
  pixel_width = image[:width].to_i
  pixel_height = image[:height].to_i
  image.format('jpg')

  box_width = (min_max_x.max - min_max_x.min) * pixel_width
  box_height = (min_max_y.max - min_max_y.min) * pixel_height
  left = min_max_x.min * pixel_width
  top = min_max_y.min * pixel_height

  image.crop("#{box_width}x#{box_height}+#{left}+#{top}")
  image
end

# Writes a MiniMagick::Image to a buffer.
#
# @param [MiniMagick::Image] image a valid MiniMagick image.
Expand Down
1 change: 1 addition & 0 deletions lib/mindee/pdf.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require_relative 'pdf/extracted_pdf'
require_relative 'pdf/extracted_pdfs'
require_relative 'pdf/pdf_compressor'
require_relative 'pdf/pdf_extractor'
require_relative 'pdf/pdf_processor'
Expand Down
2 changes: 1 addition & 1 deletion lib/mindee/pdf/extracted_pdf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module Mindee
# PDF Extraction Module.
module PDF
# An extracted sub-Pdf.
# An extracted sub-PDF.
class ExtractedPDF
# Byte contents of the pdf
# @return [StringIO]
Expand Down
20 changes: 20 additions & 0 deletions lib/mindee/pdf/extracted_pdfs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true

require_relative 'extracted_pdf'

require 'fileutils'

module Mindee
  # PDF Extraction Module.
  module PDF
    # List of extracted PDFs.
    #
    # Array subclass that adds a helper to write every element to disk in one call.
    class ExtractedPDFs < Array
      # Save all extracted PDFs to disk.
      #
      # Each element is written under +output_path+ using its own +filename+.
      # The directory (and any missing parent) is created first, so saving to a
      # not-yet-existing folder no longer fails with Errno::ENOENT.
      #
      # @param output_path [String, Pathname] Directory path to save the extracted PDFs to.
      # @return [self] The list itself, so calls can be chained.
      def save_all_to_disk(output_path)
        directory = output_path.to_s
        # An empty path is left untouched so the historical File.join behaviour is preserved.
        FileUtils.mkdir_p(directory) unless directory.empty?

        each do |pdf|
          pdf.write_to_file(File.join(directory, pdf.filename))
        end
      end
    end
  end
end
20 changes: 10 additions & 10 deletions lib/mindee/pdf/pdf_extractor.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# frozen_string_literal: true

module Mindee
# Pdf Extraction Module.
# PDF Extraction Module.
module PDF
# Pdf extraction class.
# PDF extraction class.
class PDFExtractor
# @param local_input [Mindee::Input::Source::LocalInputSource]
def initialize(local_input)
Expand All @@ -23,15 +23,15 @@ def initialize(local_input)
end
end

# Retrieves the page count for the Pdf object.
# Counts the pages of the source PDF.
# @return [Integer] Number of pages found once the source PDF is opened.
def page_count
  document = Mindee::PDF::PDFProcessor.open_pdf(@source_pdf)
  document.pages.size
end

# Creates a new Pdf from pages and save it into a buffer.
# @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
# @return [StringIO] The buffer containing the new Pdf.
# Creates a new PDF from pages and saves it into a buffer.
# @param page_indexes [Array<Integer>] List of page numbers to use for merging in the original PDF.
# @return [StringIO] The buffer containing the new PDF.
def cut_pages(page_indexes)
options = PageOptions.new(params: {
page_indexes: page_indexes,
Expand All @@ -41,10 +41,10 @@ def cut_pages(page_indexes)
end

# Extract the sub-documents from the main pdf, based on the given list of page indexes.
# @param page_indexes [Array<Array<Integer>>] List of page number to use for merging in the original Pdf.
# @return [Array<Mindee::PDF::ExtractedPDF>] The buffer containing the new Pdf.
# @param page_indexes [Array<Array<Integer>>] Lists of page numbers, one list per sub-document to extract.
# @return [Mindee::PDF::ExtractedPDFs] The extracted sub-documents.
def extract_sub_documents(page_indexes)
extracted_pdfs = [] # @type var extracted_pdfs: Array[Mindee::PDF::ExtractedPDF]
extracted_pdfs = ExtractedPDFs.new # @type var extracted_pdfs: Mindee::PDF::ExtractedPDFs
extension = File.extname(@filename)
basename = File.basename(@filename, extension)
page_indexes.each do |page_index_list|
Expand Down Expand Up @@ -74,7 +74,7 @@ def extract_sub_documents(page_indexes)
# Extracts invoices as complete PDFs from the document.
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1InvoicePageGroup>]
# @param strict [bool]
# @return [Array<Mindee::PDF::ExtractedPDF>]
# @return [Mindee::PDF::ExtractedPDFs]
def extract_invoices(page_indexes, strict: false)
raise Error::MindeePDFError, 'No indexes provided.' if page_indexes.empty?

Expand Down
2 changes: 0 additions & 2 deletions lib/mindee/v2/file_operations.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# frozen_string_literal: true

require_relative 'file_operations/crop'
require_relative 'file_operations/crop_files'
require_relative 'file_operations/split'
require_relative 'file_operations/split_files'
13 changes: 7 additions & 6 deletions lib/mindee/v2/file_operations/crop.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def self.extract_single_crop(input_source, crop)
#
# @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
# @param crops [Array<CropItem>] List of crops.
# @return [CropFiles] Individual extracted zones as an array of ExtractedImage.
# @return [Image::ExtractedImages] Individual extracted zones as an array of ExtractedImage.
# @raise [MindeeError] if the crops array is empty.
def self.extract_crops(input_source, crops)
if crops.nil? || crops.empty?
Expand All @@ -35,15 +35,16 @@ def self.extract_crops(input_source, crops)
polygons[crop.location.page] << crop.location.polygon
end

images = [] # @type var images: Array[Image::ExtractedImage]
images = Mindee::Image::ExtractedImages.new
polygons.each_with_index do |page_polygons, page_index|
extracted = Mindee::Image::ImageExtractor.extract_multiple_images_from_source(
input_source, page_index, page_polygons
images.concat(
Mindee::Image::ImageExtractor.extract_multiple_images_from_source(
input_source, page_index, page_polygons
)
)
images.concat(extracted)
end

CropFiles.new(images)
images
end
end
end
Expand Down
25 changes: 0 additions & 25 deletions lib/mindee/v2/file_operations/crop_files.rb

This file was deleted.

4 changes: 2 additions & 2 deletions lib/mindee/v2/file_operations/split.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def self.extract_single_split(input_source, split)
#
# @param input_source [LocalInputSource] Input source to split.
# @param splits [Array<Array<Integer>>] List of sub-lists of pages to keep.
# @return [SplitFiles] A list of extracted invoices.
# @return [PDF::ExtractedPDFs] A list of extracted invoices.
# @raise [MindeeError] if no indexes are provided.
def self.extract_splits(input_source, splits)
raise Mindee::Error::MindeeError, 'No indexes provided.' if splits.nil? || splits.empty?
Expand All @@ -29,7 +29,7 @@ def self.extract_splits(input_source, splits)
(split[0]..split[1]).to_a
end

SplitFiles.new(pdf_extractor.extract_sub_documents(page_groups))
pdf_extractor.extract_sub_documents(page_groups)
end
end
end
Expand Down
25 changes: 0 additions & 25 deletions lib/mindee/v2/file_operations/split_files.rb

This file was deleted.

2 changes: 1 addition & 1 deletion lib/mindee/v2/product/crop/crop_response.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def to_s
# Apply the crop inference to a file and return a list of extracted images.
#
# @param input_source [Mindee::Input::Source::LocalInputSource] Local file to extract from
# @return [FileOperation::CropFiles] List of extracted PDFs
# @return [Image::ExtractedImages] List of extracted images
def extract_from_file(input_source)
  # Crops come from the inference result held by this response.
  crops = @inference.result.crops
  FileOperation::Crop.extract_crops(input_source, crops)
end
Expand Down
2 changes: 1 addition & 1 deletion lib/mindee/v2/product/split/split_response.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def to_s

# Splits the input PDF.
# @param input_source [Mindee::Input::Source::LocalInputSource] Path to the file or a File object.
# @return [FileOperation::SplitFiles]
# @return [PDF::ExtractedPDFs]
def extract_from_file(input_source)
splits = @inference.result.splits.map(&:page_range)
FileOperation::Split.extract_splits(input_source, splits)
Expand Down
1 change: 0 additions & 1 deletion sig/mindee/image/extracted_image.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ module Mindee
?preserve_input_filename: bool
) -> Integer
def write_to_file: (String, ?String?) -> void
def as_source: -> Input::Source::BytesInputSource
def as_input_source: -> Input::Source::BytesInputSource
end
end
Expand Down
8 changes: 8 additions & 0 deletions sig/mindee/image/extracted_images.rbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# lib/mindee/image/extracted_images.rb
module Mindee
module Image
# Array of ExtractedImage with a bulk-save helper.
class ExtractedImages < Array[ExtractedImage]
# Writes every image of the list into the given directory path.
def save_all_to_disk: (String | Pathname) -> void
end
end
end
Loading
Loading