Merge branch 'DM-307-2' into 'master'
DM-307: Enable configuring custom blockification for DocuMine. Closes DM-307. See merge request redactmanager/redaction-service!20.
This commit is contained in:
commit
0fa9f01273
@ -19,260 +19,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class BlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
itty = chunkBlockList.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
previous = block;
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
public interface BlockificationService {
|
||||
|
||||
ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines);
|
||||
}
|
||||
|
||||
@ -0,0 +1,198 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine")
|
||||
@SuppressWarnings("all")
|
||||
public class DocuMineBlockificationService implements BlockificationService{
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,280 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager")
|
||||
public class RedactManagerBlockificationService implements BlockificationService{
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
itty = chunkBlockList.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
previous = block;
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
||||
@Test
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/474_G.1.2 - 1768300_MMNA_A13617AV_report.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
System.out.println("Finished structure analysis");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user