RED-7074: Design Subsection section tree structure algorithm
* post rebase fixup
This commit is contained in:
parent
61c90fc30d
commit
49f13d1f03
@ -97,10 +97,10 @@ public class TOCEnrichmentService {
|
|||||||
|
|
||||||
for (ClassifiedImage image : page.getImages()) {
|
for (ClassifiedImage image : page.getImages()) {
|
||||||
|
|
||||||
Float xMin = null;
|
Double xMin = null;
|
||||||
Float yMin = null;
|
Double yMin = null;
|
||||||
Float xMax = null;
|
Double xMax = null;
|
||||||
Float yMax = null;
|
Double yMax = null;
|
||||||
|
|
||||||
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
||||||
var headline = tocItem.getHeadline();
|
var headline = tocItem.getHeadline();
|
||||||
@ -195,7 +195,7 @@ public class TOCEnrichmentService {
|
|||||||
.get(0)
|
.get(0)
|
||||||
.stream()
|
.stream()
|
||||||
.map(cell -> {
|
.map(cell -> {
|
||||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
Cell fakeCell = Cell.copy(cell);
|
||||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||||
return fakeCell;
|
return fakeCell;
|
||||||
})
|
})
|
||||||
|
|||||||
@ -58,17 +58,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void calculateBBox() {
|
|
||||||
|
|
||||||
if (sequences == null) {
|
|
||||||
this.bBox = new Rectangle2D.Double();
|
|
||||||
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
setToBBoxOfComponents(sequences);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getPageHeight() {
|
public float getPageHeight() {
|
||||||
|
|
||||||
@ -83,6 +72,17 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void calculateBBox() {
|
||||||
|
|
||||||
|
if (sequences == null) {
|
||||||
|
this.bBox = new Rectangle2D.Double();
|
||||||
|
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
setToBBoxOfComponents(sequences);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
if (textBlocksToMerge.isEmpty()) {
|
if (textBlocksToMerge.isEmpty()) {
|
||||||
@ -105,7 +105,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private void calculateFrequencyCounters() {
|
private void calculateFrequencyCounters() {
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
|||||||
@ -237,7 +237,7 @@ public class BlockificationPostprocessingService {
|
|||||||
boolean modifiedBlockToSplit = false;
|
boolean modifiedBlockToSplit = false;
|
||||||
if (!wordSequenceResult.inSequence.isEmpty()) {
|
if (!wordSequenceResult.inSequence.isEmpty()) {
|
||||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||||
blockToSplit.resize();
|
blockToSplit.calculateBBox();
|
||||||
modifiedBlockToSplit = true;
|
modifiedBlockToSplit = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -368,7 +368,7 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
assert firstBlock != null;
|
assert firstBlock != null;
|
||||||
firstBlock.setToDuplicate(false);
|
firstBlock.setToDuplicate(false);
|
||||||
firstBlock.resize();
|
firstBlock.calculateBBox();
|
||||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -101,18 +101,6 @@ public class DocstrumBlockificationService {
|
|||||||
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||||
});
|
});
|
||||||
|
|
||||||
if (xyOrder) {
|
|
||||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
|
||||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
|
||||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
|
||||||
@Override
|
|
||||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
|
||||||
|
|
||||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return abstractPageBlocks;
|
return abstractPageBlocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -196,8 +184,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
return previous.intersectsY(current)//(Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD && Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -326,67 +313,6 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
int indexOnPage = 0;
|
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
|
||||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
|
||||||
|
|
||||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
|
||||||
TextPositionSequence prev = null;
|
|
||||||
|
|
||||||
for (TextPositionSequence word : textPositions) {
|
|
||||||
|
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
|
||||||
|
|
||||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
|
||||||
indexOnPage++;
|
|
||||||
|
|
||||||
chunkBlockList.add(cb1);
|
|
||||||
chunkWords = new ArrayList<>();
|
|
||||||
|
|
||||||
minX = 1000;
|
|
||||||
maxX = 0;
|
|
||||||
minY = 1000;
|
|
||||||
maxY = 0;
|
|
||||||
prev = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
chunkWords.add(word);
|
|
||||||
|
|
||||||
prev = word;
|
|
||||||
if (word.getMinXDirAdj() < minX) {
|
|
||||||
minX = word.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = word.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMinYDirAdj() < minY) {
|
|
||||||
minY = word.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = word.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
|
||||||
if (cb1 != null) {
|
|
||||||
chunkBlockList.add(cb1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return chunkBlockList;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean equalsWithThreshold(float f1, float f2) {
|
|
||||||
|
|
||||||
return Math.abs(f1 - f2) < THRESHOLD;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
return new TextPageBlock(wordBlockList);
|
return new TextPageBlock(wordBlockList);
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user