diff --git a/src/main/java/com/mindee/image/ExtractedImage.java b/src/main/java/com/mindee/image/ExtractedImage.java index 0767a381b..c8c63606a 100644 --- a/src/main/java/com/mindee/image/ExtractedImage.java +++ b/src/main/java/com/mindee/image/ExtractedImage.java @@ -1,5 +1,6 @@ package com.mindee.image; +import com.mindee.MindeeException; import com.mindee.input.LocalInputSource; import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; @@ -62,9 +63,13 @@ public void writeToFile(String outputPath) throws IOException { */ public void writeToFile(Path outputPath) throws IOException { if (!Files.isDirectory(outputPath)) { - throw new IllegalArgumentException("Provided path is not a directory."); + throw new MindeeException("Provided path is not a directory."); + } + try { + ImageIO.write(this.image, this.saveFormat, outputPath.resolve(this.filename).toFile()); + } catch (IOException e) { + throw new MindeeException("Could not save file " + this.filename + ".", e); } - ImageIO.write(this.image, this.saveFormat, outputPath.resolve(this.filename).toFile()); } /** diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index da55a863c..c383b439c 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -55,15 +55,19 @@ public BasePDFExtractor(LocalInputSource source) throws IOException { } } - public ExtractedPDF extractSinglePage( - List pageNumbers, + public ExtractedPDF extractSingleDocument( + List pageIndexes, boolean closeOriginal ) throws IOException { - if (pageNumbers.isEmpty()) { + if (pageIndexes.isEmpty()) { throw new MindeeException("Empty indexes not allowed for extraction."); } - var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal); - return new ExtractedPDF(pdfBytes, makeFilename(pageNumbers)); + var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageIndexes, closeOriginal); + 
return new ExtractedPDF( + pdfBytes, + makeFilename(pageIndexes), + pageIndexes.stream().mapToInt(Integer::intValue).toArray() + ); } /** @@ -73,11 +77,13 @@ public ExtractedPDF extractSinglePage( * @return A list of extracted files. * @throws IOException Throws if the file can't be accessed. */ - public ExtractedPDFs extractSubDocuments(List> pageIndexes) throws IOException { + public ExtractedPDFs extractMultipleDocuments( + List> pageIndexes + ) throws IOException { var extractedPDFs = new ExtractedPDFs(); for (List pageIndexElement : pageIndexes) { - extractedPDFs.add(extractSinglePage(pageIndexElement, false)); + extractedPDFs.add(extractSingleDocument(pageIndexElement, false)); } return extractedPDFs; } diff --git a/src/main/java/com/mindee/pdf/ExtractedPDF.java b/src/main/java/com/mindee/pdf/ExtractedPDF.java index 52f7fc10f..8ec27dc74 100644 --- a/src/main/java/com/mindee/pdf/ExtractedPDF.java +++ b/src/main/java/com/mindee/pdf/ExtractedPDF.java @@ -1,5 +1,6 @@ package com.mindee.pdf; +import com.mindee.MindeeException; import com.mindee.input.LocalInputSource; import java.io.IOException; import java.nio.file.Files; @@ -12,41 +13,59 @@ */ @Getter public class ExtractedPDF { + /** + * PDF content as bytes. + */ private final byte[] fileBytes; + /** + * Name of the file when writing to disk. + */ private final String filename; + /** + * 0-based indexes of all pages taken from the original PDF. + */ + private final int[] pageIndexes; + /** + * The number of pages in this PDF file. + */ + private final int pageCount; /** * Default constructor. * * @param fileBytes PDF file as bytes. * @param filename Name of the extracted file. + * @param pageIndexes 0-based indexes of all pages taken from the original PDF.
*/ - public ExtractedPDF(byte[] fileBytes, String filename) { + public ExtractedPDF(byte[] fileBytes, String filename, int[] pageIndexes) { this.fileBytes = fileBytes; this.filename = filename; + this.pageIndexes = pageIndexes; + this.pageCount = pageIndexes.length; } /** * Write the extracted PDF to a file. * * @param outputPath the output path, it may be a file or a directory. - * @throws IOException Throws if the file can't be accessed. */ - public void writeToFile(Path outputPath) throws IOException { + public void writeToFile(Path outputPath) throws MindeeException { if (!Files.isDirectory(outputPath)) { - throw new IllegalArgumentException("Provided path is not a directory."); + throw new MindeeException("Provided path is not a directory."); + } + try { + Files.write(outputPath.resolve(this.filename), this.fileBytes); + } catch (IOException e) { + throw new MindeeException("Could not save file " + this.filename + ".", e); } - - Files.write(outputPath.resolve(this.filename), this.fileBytes); } /** * Write the extracted PDF to a file. * * @param outputPath the output path, it may be a file or a directory. - * @throws IOException Throws if the file can't be accessed. */ - public void writeToFile(String outputPath) throws IOException { + public void writeToFile(String outputPath) throws MindeeException { writeToFile(Paths.get(outputPath)); } @@ -54,9 +73,8 @@ public void writeToFile(String outputPath) throws IOException { * Return the file in a format suitable for sending to MindeeClient for parsing. * * @return an instance of {@link LocalInputSource} - * @throws IOException Throws if the file can't be accessed. 
*/ - public LocalInputSource asInputSource() throws IOException { + public LocalInputSource asInputSource() { return new LocalInputSource(this.fileBytes, this.filename); } } diff --git a/src/main/java/com/mindee/v1/fileoperations/PDFExtractor.java b/src/main/java/com/mindee/v1/fileoperations/PDFExtractor.java index 290c45015..6b3905368 100644 --- a/src/main/java/com/mindee/v1/fileoperations/PDFExtractor.java +++ b/src/main/java/com/mindee/v1/fileoperations/PDFExtractor.java @@ -40,7 +40,7 @@ public List extractInvoices( .map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes) .collect(Collectors.toList()); - return extractSubDocuments(indexes); + return extractMultipleDocuments(indexes); } /** @@ -81,7 +81,7 @@ public List extractInvoices( } previousConfidence = confidence; } - return extractSubDocuments(correctPageIndexes); + return extractMultipleDocuments(correctPageIndexes); } } diff --git a/src/main/java/com/mindee/v2/fileoperations/Split.java b/src/main/java/com/mindee/v2/fileoperations/Split.java index 321e9f09f..5bf6726c9 100644 --- a/src/main/java/com/mindee/v2/fileoperations/Split.java +++ b/src/main/java/com/mindee/v2/fileoperations/Split.java @@ -17,12 +17,12 @@ public Split(LocalInputSource inputSource) throws IOException { } public ExtractedPDF extractSingleSplit(SplitRange splitRange) throws IOException { - return this.pdfSplitter.extractSinglePage(splitRange.getPageRangeDistinct(), true); + return this.pdfSplitter.extractSingleDocument(splitRange.getPageRangeDistinct(), true); } public ExtractedPDFs extractMultipleSplits(ArrayList splitRanges) throws IOException { return this.pdfSplitter - .extractSubDocuments( + .extractMultipleDocuments( splitRanges.stream().map(SplitRange::getPageRangeDistinct).collect(Collectors.toList()) ); } diff --git a/src/test/java/com/mindee/v2/fileoperations/SplitTest.java b/src/test/java/com/mindee/v2/fileoperations/SplitTest.java index 0e25e99a4..4747cda5a 100644 --- 
a/src/test/java/com/mindee/v2/fileoperations/SplitTest.java +++ b/src/test/java/com/mindee/v2/fileoperations/SplitTest.java @@ -3,6 +3,7 @@ import static com.mindee.TestingUtilities.deleteRecursively; import static com.mindee.TestingUtilities.getResourcePath; import static com.mindee.TestingUtilities.getV2ResourcePath; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import com.mindee.input.LocalInputSource; @@ -34,8 +35,7 @@ void singlePage_splitsCorrectly() throws IOException { .extractSingleSplit(doc.getInference().getResult().getSplits().get(0)); assertEquals("default_sample_pages-001-001.pdf", extractedSplit.getFilename()); - var asInputSource = extractedSplit.asInputSource(); - assertEquals(1, asInputSource.getPageCount()); + assertEquals(1, extractedSplit.getPageCount()); extractedSplit.writeToFile(outputPath); } @@ -54,13 +54,13 @@ void multiplePages_splitsCorrectly() throws IOException { var split0 = extractedSplits.get(0); assertEquals("default_sample_pages-001-001.pdf", split0.getFilename()); - var asInputSource0 = split0.asInputSource(); - assertEquals(1, asInputSource0.getPageCount()); + assertEquals(1, split0.getPageCount()); + assertArrayEquals(new int[] { 0 }, split0.getPageIndexes()); var split1 = extractedSplits.get(1); assertEquals("default_sample_pages-002-002.pdf", split1.getFilename()); - var asInputSource1 = split1.asInputSource(); - assertEquals(1, asInputSource1.getPageCount()); + assertEquals(1, split1.getPageCount()); + assertArrayEquals(new int[] { 1 }, split1.getPageIndexes()); extractedSplits.saveAllToDisk(outputPath); }