Fixed 'Comparison method violates its general contract' by using QuickSort from PDFBox

This commit is contained in:
Dominique Eifländer 2020-12-22 16:02:02 +01:00
parent 7dbc735b16
commit 000b145e71
5 changed files with 179 additions and 25 deletions

View File

@ -5,15 +5,17 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {
@ -28,7 +30,7 @@ public class ClassificationService {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
System.out.println(document.getFontSizeCounter().getCountPerValue());
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (Page page : document.getPages()) {
Rectangle btf = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
@ -39,6 +41,7 @@ public class ClassificationService {
public void classifyPage(Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextBlock) {
classifyBlock((TextBlock) textBlock, bodyTextFrame, page, document, headlineFontSizes);
@ -47,24 +50,33 @@ public class ClassificationService {
}
public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document,
List<Float> headlineFontSizes) {
if (document.getFontSizeCounter().getMostPopular() == null) {
// TODO Figure out why this happens.
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification("Header");
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification("Footer");
} else if (page.getPageNumber() == 1
&& (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock)
&& PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
} else if (page.getPageNumber() == 1 && (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock) && PositionUtils
.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter()
.getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter()
.getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification("Title");
}
}
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document
.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle()
.equals("bold")) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
@ -72,20 +84,34 @@ public class ClassificationService {
document.setHeadlines(true);
}
}
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText()
.startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter()
.getMostPopular()
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
document.setHeadlines(true);
}
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document
.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification("TextBlock Bold");
}
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter()
.getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular()) {
textBlock.setClassification("TextBlock");
}
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter().getMostPopular().equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document
.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle()
.equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("TextBlock Italic");
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)){
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification("TextBlock Unknown");
}
}
}

View File

@ -27,7 +27,9 @@ import com.iqser.red.service.redaction.v1.server.visualization.service.Annotatio
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RestController
@RequiredArgsConstructor
public class RedactionController implements RedactionResource {
@ -46,10 +48,15 @@ public class RedactionController implements RedactionResource {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions());
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest
.getManualRedactions());
log.info("Redaction analysis successful...");
return convert(pdDocument, classifiedDoc.getPages()
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());

View File

@ -0,0 +1,111 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
import java.util.ArrayDeque;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
/**
 * In-place quicksort for {@link List}s, copied from PDFBox with minimal modifications.
 *
 * <p>The implementation is iterative: index ranges that still have to be sorted are kept on an
 * explicit stack instead of the call stack. Elements that compare as equal may end up in a
 * different relative order than they had before sorting.
 */
public final class QuickSort {

    /** Comparator that orders elements through their own {@code compareTo} method. */
    private static final Comparator<? extends Comparable> OBJCOMP =
            (Comparator<Comparable>) (first, second) -> first.compareTo(second);

    /** Static utility class; not meant to be instantiated. */
    private QuickSort() {
    }

    /**
     * Sorts the given list in place using the given comparator.
     *
     * @param <T> type of the objects to be sorted
     * @param list list to be sorted; lists with fewer than two elements are left untouched
     * @param cmp comparator used to compare the objects within the list
     */
    public static <T> void sort(List<T> list, Comparator<? super T> cmp) {
        if (list.size() >= 2) {
            quicksort(list, cmp);
        }
    }

    /**
     * Sorts the given list in place using the elements' {@code compareTo} as comparator.
     *
     * @param <T> type of the objects to be sorted
     * @param list list to be sorted
     */
    public static <T extends Comparable> void sort(List<T> list) {
        Comparator<T> naturalOrder = (Comparator<T>) OBJCOMP;
        sort(list, naturalOrder);
    }

    /**
     * Sorts the whole list by repeatedly partitioning index ranges taken from a work stack.
     *
     * @param list list to be sorted
     * @param cmp comparator used to compare the objects within the list
     */
    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {
        // Every entry is a half-open index range {from, to} that still needs sorting.
        Deque<int[]> pending = new ArrayDeque<>();
        pending.push(new int[] {0, list.size()});
        while (!pending.isEmpty()) {
            int[] range = pending.pop();
            int from = range[0];
            int to = range[1];
            if (to - from >= 2) {
                int middle = from + (to - from) / 2;
                int pivotIndex = partition(list, cmp, middle, from, to);
                // The upper part is pushed first so that the lower part is handled next.
                pending.push(new int[] {pivotIndex + 1, to});
                pending.push(new int[] {from, pivotIndex});
            }
        }
    }

    /**
     * Partitions the range {@code [start, end)} around the element currently at {@code pivotPos}.
     *
     * @param list list whose range is partitioned
     * @param cmp comparator used to compare the objects within the list
     * @param pivotPos index of the element to use as pivot
     * @param start first index of the range (inclusive)
     * @param end last index of the range (exclusive)
     * @return the index at which the pivot element ends up
     */
    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int pivotPos, int start, int end) {
        T pivot = list.get(pivotPos);
        // Park the pivot in the last slot of the range while the rest is rearranged.
        swap(list, pivotPos, end - 1);

        int low = start;
        int high = end - 2;
        while (low < high) {
            if (cmp.compare(list.get(low), pivot) <= 0) {
                low++;
                continue;
            }
            if (cmp.compare(pivot, list.get(high)) <= 0) {
                high--;
                continue;
            }
            swap(list, low, high);
        }

        // The pivot belongs right after 'high' when that element is strictly smaller than it.
        int pivotIndex = cmp.compare(list.get(high), pivot) < 0 ? high + 1 : high;
        swap(list, end - 1, pivotIndex);
        return pivotIndex;
    }

    /** Exchanges the elements at positions {@code i} and {@code j} of the list. */
    private static <T> void swap(List<T> list, int i, int j) {
        // List.set returns the element it replaced, which is then stored at position i.
        list.set(i, list.set(j, list.get(i)));
    }
}

View File

@ -1,7 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
import java.math.BigDecimal;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@ -13,23 +12,29 @@ public class Utils {
private final static float EPSILON = 0.1f;
/**
 * Checks whether two floating point values are approximately equal.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} when the absolute difference of both values is strictly below {@code EPSILON}
 */
public static boolean feq(double f1, double f2) {
    double distance = Math.abs(f1 - f2);
    return distance < EPSILON;
}
/**
 * Rounds a value to the given number of decimal places using half-up rounding.
 *
 * @param d value to round
 * @param decimalPlace number of digits to keep after the decimal point
 * @return the rounded value converted to {@code float}
 */
public static float round(double d, int decimalPlace) {
    // RoundingMode replaces the legacy int constant BigDecimal.ROUND_HALF_UP (deprecated since Java 9);
    // it is fully qualified here so that the file's import block does not have to change.
    return BigDecimal.valueOf(d)
            .setScale(decimalPlace, java.math.RoundingMode.HALF_UP)
            .floatValue();
}
public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
try {
Collections.sort(list, comparator);
} catch (IllegalArgumentException e){
//TODO Figure out why this happens.
QuickSort.sort(list, comparator);
} catch (IllegalArgumentException e) {
// This should not happen since we use QuickSort from PDFBox
log.warn(e.getMessage());
}
}
}
}

View File

@ -331,6 +331,7 @@ public class RedactionIntegrationTest {
@Test
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
long start = System.currentTimeMillis();
System.out.println("noExceptionShouldBeThrownForAnyFiles");
ClassLoader loader = getClass().getClassLoader();
URL url = loader.getResource("files");
@ -356,6 +357,10 @@ public class RedactionIntegrationTest {
});
}
long end = System.currentTimeMillis();
System.out.println("duration: " + (end - start));
}
@ -382,7 +387,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))