Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
90b4869761 |
@ -6,6 +6,8 @@ import lombok.SneakyThrows;
|
|||||||
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import javax.annotation.PostConstruct;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class PDFNetInitializer {
|
public class PDFNetInitializer {
|
||||||
@ -18,14 +20,12 @@ public class PDFNetInitializer {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
|
@PostConstruct
|
||||||
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
|
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
|
||||||
public void init() {
|
public void init() {
|
||||||
|
|
||||||
PDFNet.initialize(pdftronLicense);
|
|
||||||
PDFNet.setTempPath("/tmp/pdftron");
|
PDFNet.setTempPath("/tmp/pdftron");
|
||||||
PDFNet.addResourceSearchPath(ocrModulePath);
|
PDFNet.addResourceSearchPath(ocrModulePath);
|
||||||
|
PDFNet.initialize(pdftronLicense);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,2 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.service;public class DataExtractionService {
|
||||||
|
}
|
||||||
@ -47,16 +47,17 @@ public class ImagePositionRetrievalService {
|
|||||||
ElementReader reader = new ElementReader();
|
ElementReader reader = new ElementReader();
|
||||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||||
RectCollection imagePositions = new RectCollection();
|
RectCollection imagePositions = new RectCollection();
|
||||||
reader.begin(pdfDoc.getPage(pageId));
|
|
||||||
|
|
||||||
|
reader.begin(pdfDoc.getPage(pageId));
|
||||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||||
imagePositions = mergeOverlappingRects(imagePositions);
|
imagePositions = mergeOverlappingRects(imagePositions);
|
||||||
|
|
||||||
reader.end();
|
reader.end();
|
||||||
|
|
||||||
if (imagePositions.getNumRects() > 0) {
|
if (imagePositions.getNumRects() > 0) {
|
||||||
pageIdToImagePositions.put(pageId, imagePositions);
|
pageIdToImagePositions.put(pageId, imagePositions);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
reader.destroy();
|
||||||
return pageIdToImagePositions;
|
return pageIdToImagePositions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -87,13 +87,27 @@ public class InvisibleElementRemovalService {
|
|||||||
.visitedXObjIds(visitedXObjIds)
|
.visitedXObjIds(visitedXObjIds)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
|
||||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||||
|
|
||||||
context.visitedXObjIds().clear();
|
context.visitedXObjIds().clear();
|
||||||
|
|
||||||
removeOverlappedElements(page, writer, context);
|
removeOverlappedElements(page, writer, context);
|
||||||
|
reader.end();
|
||||||
|
writer.end();
|
||||||
}
|
}
|
||||||
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
|
||||||
|
|
||||||
|
try {
|
||||||
|
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("File could not be saved after invisible element removal");
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.destroy();
|
||||||
|
reader.destroy();
|
||||||
|
pdfDoc.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -216,7 +230,7 @@ public class InvisibleElementRemovalService {
|
|||||||
formWriter.setDefaultGState(context.reader());
|
formWriter.setDefaultGState(context.reader());
|
||||||
|
|
||||||
processElements(formWriter, context);
|
processElements(formWriter, context);
|
||||||
formWriter.end();
|
formWriter.destroy();
|
||||||
context.reader().end();
|
context.reader().end();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -352,7 +366,7 @@ public class InvisibleElementRemovalService {
|
|||||||
formWriter.setDefaultGState(context.reader());
|
formWriter.setDefaultGState(context.reader());
|
||||||
|
|
||||||
processOverlappedElements(formWriter, context);
|
processOverlappedElements(formWriter, context);
|
||||||
formWriter.end();
|
formWriter.destroy();
|
||||||
context.reader().end();
|
context.reader().end();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -376,7 +390,8 @@ public class InvisibleElementRemovalService {
|
|||||||
switch (operator) {
|
switch (operator) {
|
||||||
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
||||||
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
||||||
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
case PathData.e_cubicto ->
|
||||||
|
linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
||||||
case PathData.e_closepath -> linePath.closePath();
|
case PathData.e_closepath -> linePath.closePath();
|
||||||
case PathData.e_rect -> {
|
case PathData.e_rect -> {
|
||||||
double x = points.next();
|
double x = points.next();
|
||||||
@ -427,6 +442,9 @@ public class InvisibleElementRemovalService {
|
|||||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
rect.getGState().setStrokeColor(colorPt);
|
rect.getGState().setStrokeColor(colorPt);
|
||||||
writer.writePlacedElement(rect);
|
writer.writePlacedElement(rect);
|
||||||
|
|
||||||
|
colorPt.destroy();
|
||||||
|
eb.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -98,6 +98,7 @@ public class OCRService {
|
|||||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||||
|
|
||||||
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
||||||
|
|
||||||
singlePagePdfDoc.close();
|
singlePagePdfDoc.close();
|
||||||
|
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||||
|
|||||||
@ -1,26 +1,23 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
|
||||||
import org.springframework.http.HttpStatus;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
||||||
import com.pdftron.pdf.PDFNet;
|
|
||||||
|
|
||||||
import feign.FeignException;
|
import feign.FeignException;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||||
|
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||||
|
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||||
|
import org.springframework.http.HttpStatus;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@ -30,7 +27,6 @@ public class OcrMessageReceiver {
|
|||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||||
private final PDFNetInitializer pdfNetInitializer;
|
|
||||||
|
|
||||||
private final OCRService ocrService;
|
private final OCRService ocrService;
|
||||||
|
|
||||||
@ -39,7 +35,6 @@ public class OcrMessageReceiver {
|
|||||||
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||||
public void receiveOcr(String in) throws JsonProcessingException {
|
public void receiveOcr(String in) throws JsonProcessingException {
|
||||||
|
|
||||||
pdfNetInitializer.init();
|
|
||||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
||||||
|
|
||||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
@ -62,8 +57,6 @@ public class OcrMessageReceiver {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
PDFNet.terminate();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,24 +1,28 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server;
|
package com.iqser.red.service.ocr.v1.server;
|
||||||
|
|
||||||
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
|
||||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
import org.junit.jupiter.api.AfterAll;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import com.pdftron.pdf.PDFNet;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import org.junit.jupiter.api.*;
|
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||||
import org.springframework.boot.test.context.SpringBootTest;
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
|
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||||
import org.springframework.context.annotation.Bean;
|
import org.springframework.context.annotation.Bean;
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
import org.springframework.context.annotation.Import;
|
import org.springframework.context.annotation.Import;
|
||||||
import org.springframework.context.annotation.Primary;
|
import org.springframework.context.annotation.Primary;
|
||||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||||
|
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||||
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
|
import com.pdftron.pdf.PDFNet;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@ExtendWith(SpringExtension.class)
|
@ExtendWith(SpringExtension.class)
|
||||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||||
@ -28,19 +32,14 @@ public class AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
protected StorageService storageService;
|
protected StorageService storageService;
|
||||||
|
|
||||||
@Autowired
|
@MockBean
|
||||||
private PDFNetInitializer pdfNetInitializer;
|
protected RabbitTemplate rabbitTemplate;
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
@SneakyThrows
|
|
||||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
|
||||||
public void initPDFNet() {
|
|
||||||
pdfNetInitializer.init();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
public static void terminatePDFNet() {
|
public static void terminatePDFNet() {
|
||||||
PDFNet.terminate();
|
PDFNet.terminate();
|
||||||
|
System.out.println("PDFNet Terminated");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,43 +1,36 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server;
|
package com.iqser.red.service.ocr.v1.server;
|
||||||
|
|
||||||
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
|
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||||
import com.pdftron.common.PDFNetException;
|
import com.pdftron.pdf.OCRModule;
|
||||||
import com.pdftron.pdf.*;
|
|
||||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||||
import io.micrometer.prometheus.PrometheusTimer;
|
import io.micrometer.prometheus.PrometheusTimer;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.boot.test.context.SpringBootTest;
|
|
||||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
|
||||||
import org.springframework.core.io.ClassPathResource;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/home/kschuettler/iqser/PDFTron/ocr/Lib/"})
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
|
|
||||||
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
|
||||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
protected ObjectMapper objectMapper;
|
protected ObjectMapper objectMapper;
|
||||||
|
|
||||||
@MockBean
|
|
||||||
protected RabbitTemplate rabbitTemplate;
|
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private OCRService ocrService;
|
private OCRService ocrService;
|
||||||
|
|
||||||
@ -72,6 +65,29 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
public void testOcr() {
|
public void testOcr() {
|
||||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||||
String text = testOCR("StitchedImagesMultiPage");
|
String text = testOCR("StitchedImagesMultiPage");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
testOCR("131 IDD0000261725");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -139,25 +155,5 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
return extractAllTextFromDocument(fileStream);
|
return extractAllTextFromDocument(fileStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
|
||||||
|
|
||||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
|
||||||
TextExtractor extractor = new TextExtractor();
|
|
||||||
List<String> texts = new ArrayList<>();
|
|
||||||
|
|
||||||
PageIterator iterator = pdfDoc.getPageIterator();
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
Page page = iterator.next();
|
|
||||||
extractor.begin(page);
|
|
||||||
texts.add(extractor.getAsText());
|
|
||||||
}
|
|
||||||
|
|
||||||
return String.join("\n", texts);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,14 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class DataExtractionServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void extractData() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,15 +1,9 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
import com.pdftron.common.PDFNetException;
|
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawGrid;
|
||||||
import com.pdftron.pdf.*;
|
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawRectCollection;
|
||||||
import com.pdftron.sdf.SDFDoc;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
|
||||||
import org.springframework.core.io.ClassPathResource;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
@ -20,8 +14,18 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
import org.junit.jupiter.api.Test;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.ElementWriter;
|
||||||
|
import com.pdftron.pdf.PDFDoc;
|
||||||
|
import com.pdftron.pdf.RectCollection;
|
||||||
|
import com.pdftron.sdf.SDFDoc;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
|
||||||
class ImagePositionRetrievalServiceTest extends AbstractTest {
|
class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||||
@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private ImagePositionRetrievalService imagePositionRetrievalService;
|
private ImagePositionRetrievalService imagePositionRetrievalService;
|
||||||
|
|
||||||
@MockBean
|
|
||||||
protected RabbitTemplate rabbitTemplate;
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
||||||
@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||||
|
|
||||||
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath());
|
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
|
||||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||||
|
|
||||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||||
|
|
||||||
ElementWriter writer = new ElementWriter();
|
ElementWriter writer = new ElementWriter();
|
||||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||||
try {
|
try {
|
||||||
writer.begin(pdfDoc.getPage(pageId));
|
writer.begin(pdfDoc.getPage(pageId));
|
||||||
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId));
|
drawRectCollection(writer, rectCollection);
|
||||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||||
writer.end();
|
writer.end();
|
||||||
StringBuilder zonesString = new StringBuilder();
|
StringBuilder zonesString = new StringBuilder();
|
||||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||||
var r = rectCollection.getRectAt(j);
|
var r = rectCollection.getRectAt(j);
|
||||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||||
|
}
|
||||||
|
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||||
|
} catch (PDFNetException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
});
|
||||||
} catch (PDFNetException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Check visually for red Rectangles to match images in the saved pdf file
|
// Check visually for red Rectangles to match images in the saved pdf file
|
||||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||||
|
}
|
||||||
|
pdfDoc.close();
|
||||||
|
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||||
|
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||||
|
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
|
||||||
fileStream.close();
|
|
||||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
|
||||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
|||||||
return coords;
|
return coords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void drawGrid(ElementWriter writer, Page page) {
|
|
||||||
|
|
||||||
ElementBuilder eb = new ElementBuilder();
|
|
||||||
double dX = 15;
|
|
||||||
double dY = 15;
|
|
||||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
|
||||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
|
||||||
for (int row = 0; row < nRows; ++row) {
|
|
||||||
for (int col = 0; col < nCols; ++col) {
|
|
||||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
|
||||||
cell.setPathStroke(true);
|
|
||||||
cell.getGState().setLineWidth(1);
|
|
||||||
cell.getGState().setStrokeOpacity(0.1);
|
|
||||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
|
||||||
if (row == 0 && col == 0) {
|
|
||||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
|
||||||
cell.setPathFill(true);
|
|
||||||
cell.getGState().setFillOpacity(0.8);
|
|
||||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
|
||||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
|
||||||
} else {
|
|
||||||
cell.setPathFill(false);
|
|
||||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
|
||||||
}
|
|
||||||
writer.writePlacedElement(cell);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
|
|
||||||
|
|
||||||
ColorPt colorPt = new ColorPt(1, 0, 0);
|
|
||||||
ElementBuilder eb = new ElementBuilder();
|
|
||||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
|
||||||
Rect r = rectCollection.getRectAt(i);
|
|
||||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
|
||||||
|
|
||||||
rect.setPathStroke(true);
|
|
||||||
rect.getGState().setLineWidth(5);
|
|
||||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
|
||||||
rect.getGState().setStrokeColor(colorPt);
|
|
||||||
|
|
||||||
rect.setPathFill(true);
|
|
||||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
|
||||||
rect.getGState().setFillColor(colorPt);
|
|
||||||
rect.getGState().setFillOpacity(0.5);
|
|
||||||
|
|
||||||
writer.writePlacedElement(rect);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,31 +1,25 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
import com.pdftron.pdf.PDFDoc;
|
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
|
||||||
import com.pdftron.pdf.Page;
|
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||||
import com.pdftron.pdf.PageIterator;
|
|
||||||
import com.pdftron.pdf.TextExtractor;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
|
||||||
import org.springframework.core.io.ClassPathResource;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
|
||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
import org.junit.jupiter.api.Test;
|
||||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||||
|
|
||||||
@MockBean
|
|
||||||
protected RabbitTemplate rabbitTemplate;
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||||
|
|
||||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
|
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
||||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
|
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
||||||
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
||||||
TextExtractor extractor = new TextExtractor();
|
|
||||||
|
|
||||||
PDFDoc pdfDoc;
|
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
String[] text = extractAllTextFromDocument(fileStream).split("\n");
|
||||||
pdfDoc = new PDFDoc(fileStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
PageIterator iterator = pdfDoc.getPageIterator();
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
Page page = iterator.next();
|
|
||||||
extractor.begin(page);
|
|
||||||
String[] text = extractor.getAsText().split("\n");
|
|
||||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,86 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.utils;
|
||||||
|
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.*;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class PdfUtils {
|
||||||
|
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||||
|
|
||||||
|
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||||
|
TextExtractor extractor = new TextExtractor();
|
||||||
|
List<String> texts = new ArrayList<>();
|
||||||
|
|
||||||
|
PageIterator iterator = pdfDoc.getPageIterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
Page page = iterator.next();
|
||||||
|
extractor.begin(page);
|
||||||
|
texts.add(extractor.getAsText());
|
||||||
|
}
|
||||||
|
|
||||||
|
extractor.destroy();
|
||||||
|
pdfDoc.close();
|
||||||
|
return String.join("\n", texts);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public static void drawGrid(ElementWriter writer, Page page) {
|
||||||
|
|
||||||
|
ElementBuilder eb = new ElementBuilder();
|
||||||
|
double dX = 15;
|
||||||
|
double dY = 15;
|
||||||
|
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||||
|
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||||
|
for (int row = 0; row < nRows; ++row) {
|
||||||
|
for (int col = 0; col < nCols; ++col) {
|
||||||
|
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||||
|
cell.setPathStroke(true);
|
||||||
|
cell.getGState().setLineWidth(1);
|
||||||
|
cell.getGState().setStrokeOpacity(0.1);
|
||||||
|
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
if (row == 0 && col == 0) {
|
||||||
|
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||||
|
cell.setPathFill(true);
|
||||||
|
cell.getGState().setFillOpacity(0.8);
|
||||||
|
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||||
|
} else {
|
||||||
|
cell.setPathFill(false);
|
||||||
|
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||||
|
}
|
||||||
|
writer.writePlacedElement(cell);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
eb.destroy();
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
|
||||||
|
|
||||||
|
ColorPt colorPt = new ColorPt(1, 0, 0);
|
||||||
|
ElementBuilder eb = new ElementBuilder();
|
||||||
|
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||||
|
Rect r = rectCollection.getRectAt(i);
|
||||||
|
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||||
|
|
||||||
|
rect.setPathStroke(true);
|
||||||
|
rect.getGState().setLineWidth(5);
|
||||||
|
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
rect.getGState().setStrokeColor(colorPt);
|
||||||
|
|
||||||
|
rect.setPathFill(true);
|
||||||
|
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
rect.getGState().setFillColor(colorPt);
|
||||||
|
rect.getGState().setFillOpacity(0.5);
|
||||||
|
|
||||||
|
writer.writePlacedElement(rect);
|
||||||
|
}
|
||||||
|
colorPt.destroy();
|
||||||
|
eb.destroy();
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user