diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java index 8de040e6..337b9182 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java @@ -5,15 +5,17 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +@Slf4j @Service @RequiredArgsConstructor public class ClassificationService { @@ -28,7 +30,7 @@ public class ClassificationService { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); - System.out.println(document.getFontSizeCounter().getCountPerValue()); + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); for (Page page : document.getPages()) { Rectangle btf = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame; @@ -39,6 +41,7 @@ public class ClassificationService { public void classifyPage(Rectangle bodyTextFrame, Page page, Document document, List headlineFontSizes) { + for (AbstractTextContainer textBlock : page.getTextBlocks()) { if (textBlock instanceof TextBlock) { classifyBlock((TextBlock) textBlock, bodyTextFrame, page, document, headlineFontSizes); @@ -47,24 +50,33 @@ public class ClassificationService { } - public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document, List headlineFontSizes) { + public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document, + List headlineFontSizes) { + if (document.getFontSizeCounter().getMostPopular() == null) { // TODO Figure out why this happens. return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification("Header"); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification("Footer"); - } else if (page.getPageNumber() == 1 - && (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock) - && PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { + } else if (page.getPageNumber() == 1 && (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock) && PositionUtils + .getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter() + .getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter() + .getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification("Title"); } - } - else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document + .getFontSizeCounter() + .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle() + .equals("bold")) { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { @@ -72,20 +84,34 @@ public class ClassificationService { document.setHeadlines(true); } } - }else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText() + .startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") && !document.getFontStyleCounter() + .getMostPopular() + .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification("H " + (headlineFontSizes.size() + 1)); document.setHeadlines(true); - } - else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document + .getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle() + .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification("TextBlock Bold"); - } - else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter() + .getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular()) { textBlock.setClassification("TextBlock"); - } - else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter().getMostPopular().equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document + .getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle() + .equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification("TextBlock Italic"); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)){ + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification("TextBlock Unknown"); } } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index d9a197a9..69a58342 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -27,7 +27,9 @@ import com.iqser.red.service.redaction.v1.server.visualization.service.Annotatio import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +@Slf4j @RestController @RequiredArgsConstructor public class RedactionController implements RedactionResource { @@ -46,10 +48,15 @@ public class RedactionController implements RedactionResource { pdDocument.setAllSecurityToBeRemoved(true); Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + + log.info("Document structure analysis successful, starting redaction analysis..."); + entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions()); annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest .getManualRedactions()); + log.info("Redaction analysis successful..."); + return convert(pdDocument, classifiedDoc.getPages() .size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java new file mode 100644 index 00000000..5b9c3b6c --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/QuickSort.java @@ -0,0 +1,111 @@ +package com.iqser.red.service.redaction.v1.server.tableextraction.utils; + +import java.util.ArrayDeque; +import java.util.Comparator; +import java.util.Deque; +import java.util.List; + +/** + * Copied and minimal modified from PDFBox. + */ +public final class QuickSort { + + private QuickSort() { + + } + + + private static final Comparator OBJCOMP = new Comparator() { + @Override + public int compare(Comparable object1, Comparable object2) { + + return object1.compareTo(object2); + } + }; + + + /** + * Sorts the given list using the given comparator. + * + * @param type of the objects to be sorted. + * @param list list to be sorted + * @param cmp comparator used to compare the objects within the list + */ + public static void sort(List list, Comparator cmp) { + + int size = list.size(); + if (size < 2) { + return; + } + quicksort(list, cmp); + } + + + /** + * Sorts the given list using compareTo as comparator. + * + * @param type of the objects to be sorted. + * @param list list to be sorted + */ + public static void sort(List list) { + + sort(list, (Comparator) OBJCOMP); + } + + + private static void quicksort(List list, Comparator cmp) { + + Deque stack = new ArrayDeque(); + stack.push(0); + stack.push(list.size()); + while (!stack.isEmpty()) { + int right = stack.pop(); + int left = stack.pop(); + if (right - left < 2) { + continue; + } + int p = left + ((right - left) / 2); + p = partition(list, cmp, p, left, right); + + stack.push(p + 1); + stack.push(right); + + stack.push(left); + stack.push(p); + } + } + + + private static int partition(List list, Comparator cmp, int p, int start, int end) { + + int l = start; + int h = end - 2; + T piv = list.get(p); + swap(list, p, end - 1); + + while (l < h) { + if (cmp.compare(list.get(l), piv) <= 0) { + l++; + } else if (cmp.compare(piv, list.get(h)) <= 0) { + h--; + } else { + swap(list, l, h); + } + } + int idx = h; + if (cmp.compare(list.get(h), piv) < 0) { + idx++; + } + swap(list, end - 1, idx); + return idx; + } + + + private static void swap(List list, int i, int j) { + + T tmp = list.get(i); + list.set(i, list.get(j)); + list.set(j, tmp); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java index 9713baf4..62f72434 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/utils/Utils.java @@ -1,7 +1,6 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.utils; import java.math.BigDecimal; -import java.util.Collections; import java.util.Comparator; import java.util.List; @@ -13,23 +12,29 @@ public class Utils { private final static float EPSILON = 0.1f; + public static boolean feq(double f1, double f2) { + return (Math.abs(f1 - f2) < EPSILON); } + public static float round(double d, int decimalPlace) { + BigDecimal bd = BigDecimal.valueOf(d); bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP); return bd.floatValue(); } - public static void sort(List list, Comparator comparator) { + + public static void sort(List list, Comparator comparator) { + try { - Collections.sort(list, comparator); - } catch (IllegalArgumentException e){ - //TODO Figure out why this happens. + QuickSort.sort(list, comparator); + } catch (IllegalArgumentException e) { + // This should not happen since we use QuickSort from PDFBox log.warn(e.getMessage()); } - } + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index ffed68e6..87acf741 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -331,6 +331,7 @@ public class RedactionIntegrationTest { @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { + long start = System.currentTimeMillis(); System.out.println("noExceptionShouldBeThrownForAnyFiles"); ClassLoader loader = getClass().getClassLoader(); URL url = loader.getResource("files"); @@ -356,6 +357,10 @@ public class RedactionIntegrationTest { }); } + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + } @@ -382,7 +387,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))