Fixed 'Comparison method violates its general contract' by using QuickSort from PDFBox
This commit is contained in:
parent
7dbc735b16
commit
000b145e71
@ -5,15 +5,17 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ClassificationService {
|
||||
@ -28,7 +30,7 @@ public class ClassificationService {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
System.out.println(document.getFontSizeCounter().getCountPerValue());
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
Rectangle btf = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||
@ -39,6 +41,7 @@ public class ClassificationService {
|
||||
|
||||
|
||||
public void classifyPage(Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextBlock) {
|
||||
classifyBlock((TextBlock) textBlock, bodyTextFrame, page, document, headlineFontSizes);
|
||||
@ -47,24 +50,33 @@ public class ClassificationService {
|
||||
}
|
||||
|
||||
|
||||
public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
|
||||
public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document,
|
||||
List<Float> headlineFontSizes) {
|
||||
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
// TODO Figure out why this happens.
|
||||
return;
|
||||
}
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification("Header");
|
||||
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification("Footer");
|
||||
} else if (page.getPageNumber() == 1
|
||||
&& (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
} else if (page.getPageNumber() == 1 && (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame, textBlock) && PositionUtils
|
||||
.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter()
|
||||
.getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification("Title");
|
||||
}
|
||||
}
|
||||
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document
|
||||
.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold")) {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
@ -72,20 +84,34 @@ public class ClassificationService {
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText()
|
||||
.startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document
|
||||
.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification("TextBlock Bold");
|
||||
}
|
||||
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter()
|
||||
.getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular()) {
|
||||
textBlock.setClassification("TextBlock");
|
||||
}
|
||||
else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter().getMostPopular().equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document
|
||||
.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle()
|
||||
.equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification("TextBlock Italic");
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)){
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification("TextBlock Unknown");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,7 +27,9 @@ import com.iqser.red.service.redaction.v1.server.visualization.service.Annotatio
|
||||
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RestController
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionController implements RedactionResource {
|
||||
@ -46,10 +48,15 @@ public class RedactionController implements RedactionResource {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, redactionRequest.getManualRedactions());
|
||||
annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest
|
||||
.getManualRedactions());
|
||||
|
||||
log.info("Redaction analysis successful...");
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages()
|
||||
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());
|
||||
|
||||
|
||||
@ -0,0 +1,111 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * In-place, iterative quicksort for {@link List}s, copied and minimally modified from PDFBox.
 * <p>
 * The implementation keeps its pending sub-ranges on an explicit stack instead of recursing and
 * performs no validation of the comparator contract itself. Elements that compare as equal may
 * end up in a different relative order than they had before sorting.
 */
public final class QuickSort {

    /**
     * Comparator that orders elements by calling {@code compareTo} on the first argument.
     */
    private static final Comparator<? extends Comparable> NATURAL_ORDER = new Comparator<Comparable>() {
        @Override
        public int compare(Comparable first, Comparable second) {

            return first.compareTo(second);
        }
    };


    /** Utility class; not meant to be instantiated. */
    private QuickSort() {

    }


    /**
     * Sorts the given list in place using the given comparator.
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted; lists with fewer than two elements are left untouched
     * @param cmp comparator used to compare the objects within the list
     */
    public static <T> void sort(List<T> list, Comparator<? super T> cmp) {

        if (list.size() >= 2) {
            quicksort(list, cmp);
        }
    }


    /**
     * Sorts the given list in place, comparing elements through their own {@code compareTo}.
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted
     */
    public static <T extends Comparable> void sort(List<T> list) {

        sort(list, (Comparator<T>) NATURAL_ORDER);
    }


    /**
     * Sorts the whole list by repeatedly partitioning half-open index ranges
     * {@code [from, to)} taken from an explicit work stack.
     */
    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {

        Deque<int[]> pending = new ArrayDeque<>();
        pending.push(new int[] {0, list.size()});

        while (!pending.isEmpty()) {
            int[] range = pending.pop();
            int from = range[0];
            int to = range[1];

            // Ranges with zero or one element are already sorted.
            if (to - from >= 2) {
                int middle = from + (to - from) / 2;
                int pivotIndex = partition(list, cmp, middle, from, to);

                // The upper part is pushed first so that the lower part is handled next.
                pending.push(new int[] {pivotIndex + 1, to});
                pending.push(new int[] {from, pivotIndex});
            }
        }
    }


    /**
     * Partitions the range {@code [start, end)} around the element currently at index {@code p}.
     *
     * @return the index at which the pivot element was finally placed
     */
    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int p, int start, int end) {

        int last = end - 1;
        T pivot = list.get(p);
        // Park the pivot at the end of the range while the remaining elements are arranged.
        swap(list, p, last);

        int low = start;
        int high = last - 1;
        while (low < high) {
            if (cmp.compare(list.get(low), pivot) <= 0) {
                low++;
                continue;
            }
            if (cmp.compare(pivot, list.get(high)) <= 0) {
                high--;
                continue;
            }
            swap(list, low, high);
        }

        // The element where both cursors met decides whether the pivot goes before or after it.
        int pivotSlot = cmp.compare(list.get(high), pivot) < 0 ? high + 1 : high;
        swap(list, last, pivotSlot);
        return pivotSlot;
    }


    /** Exchanges the elements at positions {@code i} and {@code j}. */
    private static <T> void swap(List<T> list, int i, int j) {

        T held = list.get(i);
        list.set(i, list.get(j));
        list.set(j, held);
    }

}
|
||||
@ -1,7 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
|
||||
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
|
||||
|
||||
@ -13,23 +12,29 @@ public class Utils {
|
||||
|
||||
private final static float EPSILON = 0.1f;
|
||||
|
||||
|
||||
public static boolean feq(double f1, double f2) {
|
||||
|
||||
return (Math.abs(f1 - f2) < EPSILON);
|
||||
}
|
||||
|
||||
|
||||
public static float round(double d, int decimalPlace) {
|
||||
|
||||
BigDecimal bd = BigDecimal.valueOf(d);
|
||||
bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP);
|
||||
return bd.floatValue();
|
||||
}
|
||||
|
||||
public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
|
||||
|
||||
public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
|
||||
|
||||
try {
|
||||
Collections.sort(list, comparator);
|
||||
} catch (IllegalArgumentException e){
|
||||
//TODO Figure out why this happens.
|
||||
QuickSort.sort(list, comparator);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// This should not happen since we use QuickSort from PDFBox
|
||||
log.warn(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -331,6 +331,7 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
System.out.println("noExceptionShouldBeThrownForAnyFiles");
|
||||
ClassLoader loader = getClass().getClassLoader();
|
||||
URL url = loader.getResource("files");
|
||||
@ -356,6 +357,10 @@ public class RedactionIntegrationTest {
|
||||
});
|
||||
}
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("duration: " + (end - start));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -382,7 +387,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user