Pull request #12: RED-6280: Performance Test Issue with OCR-Service

Merge in RED/ocr-service from RED-6280 to master

* commit '742725834933ad74ad582366b2b62015524bedb3':
  RED-6280:  Performance Test Issue with OCR-Service *removed init/terminate calls again *manual memory cleanup at every opportunity
This commit is contained in:
Kilian Schuettler 2023-02-28 10:30:58 +01:00
commit a4ca2db37d
11 changed files with 234 additions and 212 deletions

View File

@ -6,6 +6,8 @@ import lombok.SneakyThrows;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
@Component
@RequiredArgsConstructor
public class PDFNetInitializer {
@ -18,14 +20,12 @@ public class PDFNetInitializer {
@SneakyThrows
@PostConstruct
// Do not change this back to an application runner: an application runner takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() {
PDFNet.initialize(pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.addResourceSearchPath(ocrModulePath);
PDFNet.initialize(pdftronLicense);
}
}

View File

@ -47,16 +47,17 @@ public class ImagePositionRetrievalService {
ElementReader reader = new ElementReader();
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
RectCollection imagePositions = new RectCollection();
reader.begin(pdfDoc.getPage(pageId));
reader.begin(pdfDoc.getPage(pageId));
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
imagePositions = mergeOverlappingRects(imagePositions);
reader.end();
if (imagePositions.getNumRects() > 0) {
pageIdToImagePositions.put(pageId, imagePositions);
}
}
reader.destroy();
return pageIdToImagePositions;
}

View File

@ -77,7 +77,6 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
@ -92,8 +91,20 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
reader.end();
writer.end();
}
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
}
writer.destroy();
reader.destroy();
pdfDoc.close();
}
@ -216,13 +227,14 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
PathData pathData = pathElement.getPathData();
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
@ -352,7 +364,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
@ -427,6 +439,9 @@ public class InvisibleElementRemovalService {
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
colorPt.destroy();
eb.destroy();
}

View File

@ -98,6 +98,7 @@ public class OCRService {
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,

View File

@ -1,26 +1,23 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.pdftron.pdf.PDFNet;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@Slf4j
@Service
@ -30,7 +27,6 @@ public class OcrMessageReceiver {
private final ObjectMapper objectMapper;
private final FileStorageService fileStorageService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final PDFNetInitializer pdfNetInitializer;
private final OCRService ocrService;
@ -39,7 +35,6 @@ public class OcrMessageReceiver {
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException {
pdfNetInitializer.init();
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -62,8 +57,6 @@ public class OcrMessageReceiver {
}
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
PDFNet.terminate();
}

View File

@ -1,24 +1,28 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.*;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.assertj.core.api.Assertions.assertThat;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -28,19 +32,14 @@ public class AbstractTest {
@Autowired
protected StorageService storageService;
@Autowired
private PDFNetInitializer pdfNetInitializer;
@MockBean
protected RabbitTemplate rabbitTemplate;
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void initPDFNet() {
pdfNetInitializer.init();
}
@AfterAll
public static void terminatePDFNet() {
PDFNet.terminate();
System.out.println("PDFNet Terminated");
}

View File

@ -1,33 +1,29 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.OCRModule;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
public class OcrServiceIntegrationTest extends AbstractTest {
@ -35,9 +31,6 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Autowired
protected ObjectMapper objectMapper;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Autowired
private OCRService ocrService;
@ -139,25 +132,5 @@ public class OcrServiceIntegrationTest extends AbstractTest {
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream);
}
}
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
PDFDoc pdfDoc = new PDFDoc(fileStream);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
return String.join("\n", texts);
}
}

View File

@ -1,15 +1,9 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawGrid;
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawRectCollection;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@ -20,8 +14,18 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
class ImagePositionRetrievalServiceTest extends AbstractTest {
@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
@Autowired
private ImagePositionRetrievalService imagePositionRetrievalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
public void testImagePositionRetrievalForRotateTestFileWithImages() {
@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath());
PDFDoc pdfDoc = new PDFDoc(fileStream);
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
PDFDoc pdfDoc = new PDFDoc(fileStream);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
ElementWriter writer = new ElementWriter();
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
try {
writer.begin(pdfDoc.getPage(pageId));
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId));
drawGrid(writer, pdfDoc.getPage(pageId));
writer.end();
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
var r = rectCollection.getRectAt(j);
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
ElementWriter writer = new ElementWriter();
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
try {
writer.begin(pdfDoc.getPage(pageId));
drawRectCollection(writer, rectCollection);
drawGrid(writer, pdfDoc.getPage(pageId));
writer.end();
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
var r = rectCollection.getRectAt(j);
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
});
});
// Check visually for red Rectangles to match images in the saved pdf file
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
// Check visually for red Rectangles to match images in the saved pdf file
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
}
pdfDoc.close();
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
}
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
fileStream.close();
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
}
@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
return coords;
}
@SneakyThrows
private void drawGrid(ElementWriter writer, Page page) {
ElementBuilder eb = new ElementBuilder();
double dX = 15;
double dY = 15;
int nRows = (int) (page.getPageHeight() / dY) + 1;
int nCols = (int) (page.getPageWidth() / dX) + 1;
for (int row = 0; row < nRows; ++row) {
for (int col = 0; col < nCols; ++col) {
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
cell.setPathStroke(true);
cell.getGState().setLineWidth(1);
cell.getGState().setStrokeOpacity(0.1);
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
if (row == 0 && col == 0) {
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
cell.setPathFill(true);
cell.getGState().setFillOpacity(0.8);
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
} else {
cell.setPathFill(false);
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
}
writer.writePlacedElement(cell);
}
}
}
@SneakyThrows
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
ColorPt colorPt = new ColorPt(1, 0, 0);
ElementBuilder eb = new ElementBuilder();
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
Rect r = rectCollection.getRectAt(i);
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setLineWidth(5);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
rect.setPathFill(true);
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setFillColor(colorPt);
rect.getGState().setFillOpacity(0.5);
writer.writePlacedElement(rect);
}
}
}

View File

@ -1,31 +1,25 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import lombok.SneakyThrows;
public class InvisibleElementRemovalServiceTest extends AbstractTest {
@Autowired
private InvisibleElementRemovalService invisibleElementRemovalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
}
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
}
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc;
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
pdfDoc = new PDFDoc(fileStream);
}
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
String[] text = extractor.getAsText().split("\n");
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
String[] text = extractAllTextFromDocument(fileStream).split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
}

View File

@ -0,0 +1,74 @@
package com.iqser.red.service.ocr.v1.server.utils;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import lombok.SneakyThrows;
/**
 * Static helpers that draw debugging overlays (a reference grid and highlighted
 * rectangles) onto a PDF page through an {@link ElementWriter}.
 *
 * <p>All native PDFNet helper objects created here ({@link ElementBuilder},
 * {@link ColorPt}) are destroyed before the methods return, including when a
 * PDFNet call fails part-way through.
 */
public class PdfDraw {

    /**
     * Draws a grid of 15 x 15 unit cells covering the whole page.
     *
     * <p>Every cell is stroked with line width 1 and stroke opacity 0.1. The cell at
     * row 0, column 0 is stroked and filled with RGB (0, 0, 1) at fill opacity 0.8 so
     * it can be told apart; all other cells are stroked with RGB (0.1, 0.1, 0.1) and
     * left unfilled.
     *
     * @param writer writer the cells are written to; the caller is expected to have
     *               called {@code begin} on it and to call {@code end} afterwards
     * @param page   page whose width and height determine the number of columns and rows
     */
    @SneakyThrows
    public static void drawGrid(ElementWriter writer, Page page) {
        double dX = 15;
        double dY = 15;
        int nRows = (int) (page.getPageHeight() / dY) + 1;
        int nCols = (int) (page.getPageWidth() / dX) + 1;

        ElementBuilder eb = new ElementBuilder();
        // The two colours are created once and reused for every cell instead of
        // allocating new native ColorPt objects per cell that were never destroyed.
        ColorPt originColor = null;
        ColorPt gridColor = null;
        try {
            originColor = new ColorPt(0, 0, 1);
            gridColor = new ColorPt(0.1, 0.1, 0.1);
            for (int row = 0; row < nRows; ++row) {
                for (int col = 0; col < nCols; ++col) {
                    Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                    cell.setPathStroke(true);
                    cell.getGState().setLineWidth(1);
                    cell.getGState().setStrokeOpacity(0.1);
                    cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    if (row == 0 && col == 0) {
                        // Marker cell: same colour for stroke and fill.
                        cell.getGState().setStrokeColor(originColor);
                        cell.setPathFill(true);
                        cell.getGState().setFillOpacity(0.8);
                        cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                        cell.getGState().setFillColor(originColor);
                    } else {
                        cell.setPathFill(false);
                        cell.getGState().setStrokeColor(gridColor);
                    }
                    writer.writePlacedElement(cell);
                }
            }
        } finally {
            // Release native memory even if drawing failed.
            if (gridColor != null) {
                gridColor.destroy();
            }
            if (originColor != null) {
                originColor.destroy();
            }
            eb.destroy();
        }
    }


    /**
     * Draws every rectangle of the collection as a rectangle stroked (line width 5)
     * and filled (fill opacity 0.5) with RGB (1, 0, 0).
     *
     * @param writer         writer the rectangles are written to; the caller is expected
     *                       to have called {@code begin} on it and to call {@code end} afterwards
     * @param rectCollection rectangles to draw; an empty collection draws nothing
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
        ElementBuilder eb = new ElementBuilder();
        ColorPt colorPt = null;
        try {
            colorPt = new ColorPt(1, 0, 0);
            for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                Rect r = rectCollection.getRectAt(i);
                Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());

                rect.setPathStroke(true);
                rect.getGState().setLineWidth(5);
                rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setStrokeColor(colorPt);

                rect.setPathFill(true);
                rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setFillColor(colorPt);
                rect.getGState().setFillOpacity(0.5);

                writer.writePlacedElement(rect);
            }
        } finally {
            // Release native memory even if drawing failed.
            if (colorPt != null) {
                colorPt.destroy();
            }
            eb.destroy();
        }
    }

}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.ocr.v1.server.utils;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
/**
 * Static helper for pulling the plain text out of a PDF document.
 */
public class PdfTextExtraction {

    /**
     * Extracts the text of every page of a PDF and joins the per-page texts with
     * a newline ({@code "\n"}).
     *
     * <p>The {@link PDFDoc} and {@link TextExtractor} created here are released in
     * {@code finally} blocks, so they are freed even when extraction fails. This
     * method does not itself close {@code fileStream}; that remains the caller's
     * responsibility.
     *
     * @param fileStream stream providing the PDF content
     * @return the text of all pages in page-iterator order, separated by {@code "\n"};
     *         an empty string if the document has no pages
     * @throws IOException     declared for failures while reading the stream
     * @throws PDFNetException if PDFNet fails to open the document or extract text
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {

        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            TextExtractor extractor = new TextExtractor();
            try {
                List<String> texts = new ArrayList<>();
                PageIterator iterator = pdfDoc.getPageIterator();
                while (iterator.hasNext()) {
                    Page page = iterator.next();
                    extractor.begin(page);
                    texts.add(extractor.getAsText());
                }
                return String.join("\n", texts);
            } finally {
                // Free the native extractor even if a page could not be processed.
                extractor.destroy();
            }
        } finally {
            // Always release the document, mirroring the cleanup done on the success path.
            pdfDoc.close();
        }
    }

}