Merge branch 'DM-307-2' into 'master'
DM-307: Enable configuring custom blockification for DocuMine. Closes DM-307. See merge request redactmanager/redaction-service!20
This commit is contained in:
commit
0fa9f01273
@ -19,260 +19,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo
|
|||||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
@Service
|
public interface BlockificationService {
|
||||||
@SuppressWarnings("all")
|
|
||||||
public class BlockificationService {
|
|
||||||
|
|
||||||
static final float THRESHOLD = 1f;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
|
||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
|
||||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
|
||||||
*
|
|
||||||
* @param textPositions The words of a page.
|
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
|
||||||
* @param verticalRulingLines Vertical table lines.
|
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
|
||||||
*/
|
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
int indexOnPage = 0;
|
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
|
||||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
|
||||||
|
|
||||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
|
||||||
TextPositionSequence prev = null;
|
|
||||||
|
|
||||||
boolean wasSplitted = false;
|
|
||||||
Float splitX1 = null;
|
|
||||||
for (TextPositionSequence word : textPositions) {
|
|
||||||
|
|
||||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
|
||||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
|
||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
|
||||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
|
||||||
|
|
||||||
Orientation prevOrientation = null;
|
|
||||||
if (!chunkBlockList.isEmpty()) {
|
|
||||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
|
||||||
}
|
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
|
||||||
indexOnPage++;
|
|
||||||
|
|
||||||
chunkBlockList.add(cb1);
|
|
||||||
chunkWords = new ArrayList<>();
|
|
||||||
|
|
||||||
if (splitByX && !isSplitByRuling) {
|
|
||||||
wasSplitted = true;
|
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
|
||||||
splitX1 = word.getMinXDirAdj();
|
|
||||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
|
||||||
wasSplitted = false;
|
|
||||||
cb1.setOrientation(Orientation.RIGHT);
|
|
||||||
splitX1 = null;
|
|
||||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
|
||||||
}
|
|
||||||
|
|
||||||
minX = 1000;
|
|
||||||
maxX = 0;
|
|
||||||
minY = 1000;
|
|
||||||
maxY = 0;
|
|
||||||
prev = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
chunkWords.add(word);
|
|
||||||
|
|
||||||
prev = word;
|
|
||||||
if (word.getMinXDirAdj() < minX) {
|
|
||||||
minX = word.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = word.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMinYDirAdj() < minY) {
|
|
||||||
minY = word.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = word.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
|
||||||
if (cb1 != null) {
|
|
||||||
chunkBlockList.add(cb1);
|
|
||||||
}
|
|
||||||
|
|
||||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
|
||||||
|
|
||||||
TextPageBlock previousLeft = null;
|
|
||||||
TextPageBlock previousRight = null;
|
|
||||||
while (itty.hasNext()) {
|
|
||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
|
||||||
|
|
||||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
|
||||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
|
||||||
previousLeft.add(block);
|
|
||||||
itty.remove();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
|
||||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
|
||||||
previousRight.add(block);
|
|
||||||
itty.remove();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
|
||||||
previousLeft = block;
|
|
||||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
|
||||||
previousRight = block;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
itty = chunkBlockList.iterator();
|
|
||||||
TextPageBlock previous = null;
|
|
||||||
while (itty.hasNext()) {
|
|
||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
|
||||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
|
||||||
previous.add(block);
|
|
||||||
itty.remove();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
previous = block;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ClassificationPage(chunkBlockList);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean equalsWithThreshold(float f1, float f2) {
|
|
||||||
|
|
||||||
return Math.abs(f1 - f2) < THRESHOLD;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float minX,
|
|
||||||
float minY,
|
|
||||||
float maxX,
|
|
||||||
float maxY,
|
|
||||||
TextPositionSequence word,
|
|
||||||
List<Ruling> horizontalRulingLines,
|
|
||||||
List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
return isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
|
||||||
|
|
||||||
for (Ruling ruling : rulingLines) {
|
|
||||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
|
||||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double round(float value, int decimalPoints) {
|
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
|
||||||
return Math.round(value * d) / d;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
 * Builds the text blocks of a page from its words.
 * Implementations in this package work on text direction adjusted positions (DirAdj).
 *
 * @param textPositions         The words of a page.
 * @param horizontalRulingLines Horizontal table lines.
 * @param verticalRulingLines   Vertical table lines.
 * @return Page object that contains the resulting text blocks.
 */
ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,198 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine")
|
||||||
|
@SuppressWarnings("all")
|
||||||
|
public class DocuMineBlockificationService implements BlockificationService{
|
||||||
|
|
||||||
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||||
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
|
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
|
* @param textPositions The words of a page.
|
||||||
|
* @param horizontalRulingLines Horizontal table lines.
|
||||||
|
* @param verticalRulingLines Vertical table lines.
|
||||||
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
|
*/
|
||||||
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
|
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||||
|
|
||||||
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
|
TextPositionSequence prev = null;
|
||||||
|
|
||||||
|
boolean wasSplitted = false;
|
||||||
|
Float splitX1 = null;
|
||||||
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|
||||||
|
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||||
|
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||||
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
|
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||||
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||||
|
|
||||||
|
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY)) {
|
||||||
|
|
||||||
|
Orientation prevOrientation = null;
|
||||||
|
if (!chunkBlockList1.isEmpty()) {
|
||||||
|
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||||
|
chunkBlockList1.add(cb1);
|
||||||
|
chunkWords = new ArrayList<>();
|
||||||
|
|
||||||
|
if (splitByX && !isSplitByRuling) {
|
||||||
|
wasSplitted = true;
|
||||||
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
|
splitX1 = word.getMinXDirAdj();
|
||||||
|
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||||
|
wasSplitted = false;
|
||||||
|
cb1.setOrientation(Orientation.RIGHT);
|
||||||
|
splitX1 = null;
|
||||||
|
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||||
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
minX = 1000;
|
||||||
|
maxX = 0;
|
||||||
|
minY = 1000;
|
||||||
|
maxY = 0;
|
||||||
|
prev = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkWords.add(word);
|
||||||
|
|
||||||
|
prev = word;
|
||||||
|
if (word.getMinXDirAdj() < minX) {
|
||||||
|
minX = word.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = word.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMinYDirAdj() < minY) {
|
||||||
|
minY = word.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = word.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||||
|
if (cb1 != null) {
|
||||||
|
chunkBlockList1.add(cb1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ClassificationPage(chunkBlockList1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean equalsWithThreshold(float f1, float f2) {
|
||||||
|
|
||||||
|
return Math.abs(f1 - f2) < THRESHOLD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||||
|
|
||||||
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
|
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||||
|
|
||||||
|
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||||
|
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||||
|
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||||
|
fontFrequencyCounter.add(wordBlock.getFont());
|
||||||
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
|
||||||
|
} else {
|
||||||
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null) {
|
||||||
|
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||||
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float minX,
|
||||||
|
float minY,
|
||||||
|
float maxX,
|
||||||
|
float maxY,
|
||||||
|
TextPositionSequence word,
|
||||||
|
List<Ruling> horizontalRulingLines,
|
||||||
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||||
|
|
||||||
|
for (Ruling ruling : rulingLines) {
|
||||||
|
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||||
|
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double round(float value, int decimalPoints) {
|
||||||
|
|
||||||
|
var d = Math.pow(10, decimalPoints);
|
||||||
|
return Math.round(value * d) / d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,280 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@SuppressWarnings("all")
|
||||||
|
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager")
|
||||||
|
public class RedactManagerBlockificationService implements BlockificationService{
|
||||||
|
|
||||||
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||||
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
|
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
|
*
|
||||||
|
* @param textPositions The words of a page.
|
||||||
|
* @param horizontalRulingLines Horizontal table lines.
|
||||||
|
* @param verticalRulingLines Vertical table lines.
|
||||||
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
|
*/
|
||||||
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
int indexOnPage = 0;
|
||||||
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
|
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||||
|
|
||||||
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
|
TextPositionSequence prev = null;
|
||||||
|
|
||||||
|
boolean wasSplitted = false;
|
||||||
|
Float splitX1 = null;
|
||||||
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|
||||||
|
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||||
|
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||||
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
|
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||||
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
|
||||||
|
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
|
|
||||||
|
Orientation prevOrientation = null;
|
||||||
|
if (!chunkBlockList.isEmpty()) {
|
||||||
|
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
|
indexOnPage++;
|
||||||
|
|
||||||
|
chunkBlockList.add(cb1);
|
||||||
|
chunkWords = new ArrayList<>();
|
||||||
|
|
||||||
|
if (splitByX && !isSplitByRuling) {
|
||||||
|
wasSplitted = true;
|
||||||
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
|
splitX1 = word.getMinXDirAdj();
|
||||||
|
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||||
|
wasSplitted = false;
|
||||||
|
cb1.setOrientation(Orientation.RIGHT);
|
||||||
|
splitX1 = null;
|
||||||
|
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||||
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
minX = 1000;
|
||||||
|
maxX = 0;
|
||||||
|
minY = 1000;
|
||||||
|
maxY = 0;
|
||||||
|
prev = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkWords.add(word);
|
||||||
|
|
||||||
|
prev = word;
|
||||||
|
if (word.getMinXDirAdj() < minX) {
|
||||||
|
minX = word.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = word.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMinYDirAdj() < minY) {
|
||||||
|
minY = word.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = word.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
|
if (cb1 != null) {
|
||||||
|
chunkBlockList.add(cb1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||||
|
|
||||||
|
TextPageBlock previousLeft = null;
|
||||||
|
TextPageBlock previousRight = null;
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
|
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||||
|
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||||
|
previousLeft.add(block);
|
||||||
|
itty.remove();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||||
|
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||||
|
previousRight.add(block);
|
||||||
|
itty.remove();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||||
|
previousLeft = block;
|
||||||
|
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||||
|
previousRight = block;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
itty = chunkBlockList.iterator();
|
||||||
|
TextPageBlock previous = null;
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||||
|
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||||
|
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||||
|
previous.add(block);
|
||||||
|
itty.remove();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
previous = block;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ClassificationPage(chunkBlockList);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean equalsWithThreshold(float f1, float f2) {
|
||||||
|
|
||||||
|
return Math.abs(f1 - f2) < THRESHOLD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
|
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||||
|
|
||||||
|
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||||
|
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||||
|
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||||
|
fontFrequencyCounter.add(wordBlock.getFont());
|
||||||
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
|
wordBlock.getMaxXDirAdj(),
|
||||||
|
wordBlock.getMinYDirAdj(),
|
||||||
|
wordBlock.getMaxYDirAdj(),
|
||||||
|
wordBlockList,
|
||||||
|
wordBlock.getRotation());
|
||||||
|
} else {
|
||||||
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null) {
|
||||||
|
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||||
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float minX,
|
||||||
|
float minY,
|
||||||
|
float maxX,
|
||||||
|
float maxY,
|
||||||
|
TextPositionSequence word,
|
||||||
|
List<Ruling> horizontalRulingLines,
|
||||||
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
return isSplitByRuling(maxX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMinYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(maxX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMinYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||||
|
|
||||||
|
for (Ruling ruling : rulingLines) {
|
||||||
|
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||||
|
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double round(float value, int decimalPoints) {
|
||||||
|
|
||||||
|
var d = Math.pow(10, decimalPoints);
|
||||||
|
return Math.round(value * d) / d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
|||||||
@Test
|
@Test
|
||||||
public void titleExtraction() throws IOException {
|
public void titleExtraction() throws IOException {
|
||||||
|
|
||||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/474_G.1.2 - 1768300_MMNA_A13617AV_report.pdf");
|
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
|
||||||
System.out.println("Start Full integration test");
|
System.out.println("Start Full integration test");
|
||||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||||
System.out.println("Finished structure analysis");
|
System.out.println("Finished structure analysis");
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user