cells) {
@@ -243,8 +234,8 @@ public class Table extends AbstractTextContainer {
uniqueY.add(c.getTop());
});
- var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
- var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
+ var sortedUniqueX = uniqueX.stream().sorted().toList();
+ var sortedUniqueY = uniqueY.stream().sorted().toList();
Float prevY = null;
for (Float y : sortedUniqueY) {
@@ -258,9 +249,7 @@ public class Table extends AbstractTextContainer {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
- if (intersectionCell.isPresent()) {
- cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
- }
+ intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
}
@@ -268,7 +257,7 @@ public class Table extends AbstractTextContainer {
prevX = x;
}
- if (prevY != null && prevX != null) {
+ if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
}
prevY = y;
@@ -299,7 +288,7 @@ public class Table extends AbstractTextContainer {
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
- for (TextBlock textBlock : column.getTextBlocks()) {
+ for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@@ -331,7 +320,7 @@ public class Table extends AbstractTextContainer {
sb.append(i == 0 ? "\n| " : "\n | ");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
- for (TextBlock textBlock : column.getTextBlocks()) {
+ for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append(" ");
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java
similarity index 95%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java
index 241900a1..392b1eb0 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.parsing.model;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java
new file mode 100644
index 00000000..36677e1f
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java
@@ -0,0 +1,49 @@
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
+
+import lombok.Getter;
+
+public class SearchableText {
+    // Collects TextPositionSequence objects and renders them as one normalized string.
+    @Getter
+    private final List<TextPositionSequence> sequences = new ArrayList<>();
+
+    /** Appends a single sequence to the end of the collection. */
+    public void add(TextPositionSequence textPositionSequence) {
+
+        sequences.add(textPositionSequence);
+    }
+
+    /** Appends every sequence of the given list, keeping its order. */
+    public void addAll(List<TextPositionSequence> textPositionSequences) {
+
+        sequences.addAll(textPositionSequences);
+    }
+
+    /** Returns {@link #buildString(List)} applied to the collected sequences. */
+    @Override
+    public String toString() {
+
+        return buildString(sequences);
+    }
+
+    /** Appends each sequence's string form plus one space, then runs the three normalization passes below in order. */
+    public static String buildString(List<TextPositionSequence> sequences) {
+
+        StringBuilder sb = new StringBuilder();
+        for (TextPositionSequence word : sequences) {
+            sb.append(word);
+            sb.append(' ');
+        }
+        String text = sb.toString();
+        text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
+        text = TextNormalizationUtilities.removeLineBreaks(text);
+        text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
+        return text;
+    }
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java
similarity index 74%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java
index 58b11aae..5c1ba630 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java
@@ -1,5 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.classification.model;
-
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import lombok.AllArgsConstructor;
import lombok.Builder;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java
similarity index 79%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java
index 496d64ae..2d4eabf4 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.classification.model;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.ArrayList;
import java.util.List;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java
similarity index 93%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java
index 757f6dc3..2186cc25 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java
@@ -1,10 +1,10 @@
-package com.iqser.red.service.redaction.v1.server.classification.model;
-
-import lombok.Getter;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.HashMap;
import java.util.Map;
+import lombok.Getter;
+
public class StringFrequencyCounter {
@Getter
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java
similarity index 66%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java
index 003eca70..aceb3751 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java
@@ -1,6 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.parsing.model;
-
-import java.util.Objects;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
@@ -46,18 +44,4 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
}
-
-
- public static TextDirection fromString(String degreesAsString) {
-
- Objects.requireNonNull(degreesAsString, "Cannot construct a text direction from a null value");
-
- String value = degreesAsString.strip();
-
- if (degreesAsString.endsWith(VALUE_STRING_SUFFIX)) {
- value = degreesAsString.replace(VALUE_STRING_SUFFIX + "$", "");
- }
-
- return fromDegrees(Float.parseFloat(value));
- }
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java
similarity index 88%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java
index 3061541b..ca8cc8e2 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java
@@ -1,13 +1,12 @@
-package com.iqser.red.service.redaction.v1.server.classification.model;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection;
-import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
@@ -18,7 +17,7 @@ import lombok.NoArgsConstructor;
@Builder
@Data
@NoArgsConstructor
-public class TextBlock extends AbstractTextContainer {
+public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List sequences = new ArrayList<>();
@@ -45,7 +44,7 @@ public class TextBlock extends AbstractTextContainer {
private float highestFontSize;
@JsonIgnore
- private String classification;
+ private PageBlockType classification;
@JsonIgnore
@@ -95,6 +94,7 @@ public class TextBlock extends AbstractTextContainer {
}
}
+
/**
* Returns the maxX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@@ -174,7 +174,7 @@ public class TextBlock extends AbstractTextContainer {
}
- public TextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) {
+ public TextPageBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
@@ -185,23 +185,23 @@ public class TextBlock extends AbstractTextContainer {
}
- public TextBlock union(TextPositionSequence r) {
+ public TextPageBlock union(TextPositionSequence r) {
- TextBlock union = this.copy();
+ TextPageBlock union = this.copy();
union.add(r);
return union;
}
- public TextBlock union(TextBlock r) {
+ public TextPageBlock union(TextPageBlock r) {
- TextBlock union = this.copy();
+ TextPageBlock union = this.copy();
union.add(r);
return union;
}
- public void add(TextBlock r) {
+ public void add(TextPageBlock r) {
if (r.getMinX() < minX) {
minX = r.getMinX();
@@ -236,9 +236,9 @@ public class TextBlock extends AbstractTextContainer {
}
- public TextBlock copy() {
+ public TextPageBlock copy() {
- return new TextBlock(minX, maxX, minY, maxY, sequences, rotation);
+ return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java
similarity index 99%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java
index b1aecb99..48a33a6b 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.parsing.model;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java
new file mode 100644
index 00000000..50925713
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java
@@ -0,0 +1,14 @@
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
+
+import java.util.List;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+
+@Data
+@AllArgsConstructor
+public class UnclassifiedText {
+ // Lombok-generated accessors/constructor; NOTE(review): element type is not visible here (presumably TextPageBlock) - confirm.
+ private List textBlocks;
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java
similarity index 99%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java
index bb00562e..5d96ca09 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java
@@ -14,34 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.iqser.red.service.redaction.v1.server.parsing;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
-import java.io.InputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.BoundingBox;
-
import org.apache.pdfbox.contentstream.PDFStreamEngine;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.apache.pdfbox.pdmodel.font.PDCIDFont;
-import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
-import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
-import org.apache.pdfbox.pdmodel.font.PDType0Font;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
-import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.util.Matrix;
-import org.apache.pdfbox.util.Vector;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
@@ -50,22 +34,36 @@ import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.contentstream.operator.text.BeginText;
import org.apache.pdfbox.contentstream.operator.text.EndText;
-import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
-import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
-import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
-import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
-import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
import org.apache.pdfbox.contentstream.operator.text.MoveText;
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
import org.apache.pdfbox.contentstream.operator.text.NextLine;
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
+import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
+import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
import org.apache.pdfbox.contentstream.operator.text.ShowText;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.font.PDCIDFont;
+import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
+import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
+import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
+import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
/**
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java
similarity index 86%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java
index 560d5b2c..fda2c3cb 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java
@@ -1,17 +1,32 @@
-package com.iqser.red.service.redaction.v1.server.parsing;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
-import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
-import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
-
-import lombok.Getter;
-import lombok.Setter;
-import lombok.extern.slf4j.Slf4j;
+import java.awt.geom.Point2D;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
-import org.apache.pdfbox.contentstream.operator.color.*;
-import org.apache.pdfbox.contentstream.operator.state.*;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
+import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
+import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
+import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
+import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
+import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
+import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
+import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSNumber;
@@ -19,11 +34,13 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.TextPosition;
-import java.awt.geom.Point2D;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@@ -264,8 +281,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Remove false sequence ends (whitespaces)
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
- for (TextPosition t : sublist) {
- textPositionSequences.get(textPositionSequences.size() - 1).add(t);
+ for (TextPosition textPosition : sublist) {
+ textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
}
} else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java
similarity index 99%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java
index a0eeaaa2..18be3d0e 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.iqser.red.service.redaction.v1.server.parsing;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
import java.io.BufferedInputStream;
import java.io.IOException;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java
similarity index 66%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java
index 2cff711a..d67447a9 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.classification.service;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import static java.util.stream.Collectors.toSet;
@@ -9,15 +9,15 @@ import java.util.List;
import org.springframework.stereotype.Service;
-import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
-import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
-import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil;
-import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
@@ -29,16 +29,18 @@ public class BlockificationService {
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
- * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
- * @param textPositions The words of a page.
+ * Rulings (table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
+ *
+ * @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
- * @param verticalRulingLines Vertical table lines.
+ * @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
- public Page blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) {
+ public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) {
+ int indexOnPage = 0;
List chunkWords = new ArrayList<>();
- List chunkBlockList1 = new ArrayList<>();
+ List chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
@@ -58,12 +60,14 @@ public class BlockificationService {
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
- if (!chunkBlockList1.isEmpty()) {
- prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
+ if (!chunkBlockList.isEmpty()) {
+ prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
- TextBlock cb1 = buildTextBlock(chunkWords);
- chunkBlockList1.add(cb1);
+ TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
+ indexOnPage++;
+
+ chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
@@ -102,17 +106,17 @@ public class BlockificationService {
}
}
- TextBlock cb1 = buildTextBlock(chunkWords);
+ TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
- chunkBlockList1.add(cb1);
+ chunkBlockList.add(cb1);
}
- Iterator itty = chunkBlockList1.iterator();
+ Iterator itty = chunkBlockList.iterator();
- TextBlock previousLeft = null;
- TextBlock previousRight = null;
+ TextPageBlock previousLeft = null;
+ TextPageBlock previousRight = null;
while (itty.hasNext()) {
- TextBlock block = (TextBlock) itty.next();
+ TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
@@ -137,10 +141,10 @@ public class BlockificationService {
}
}
- itty = chunkBlockList1.iterator();
- TextBlock previous = null;
+ itty = chunkBlockList.iterator();
+ TextPageBlock previous = null;
while (itty.hasNext()) {
- TextBlock block = (TextBlock) itty.next();
+ TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
@@ -153,7 +157,7 @@ public class BlockificationService {
previous = block;
}
- return new Page(chunkBlockList1);
+ return new ClassificationPage(chunkBlockList);
}
@@ -163,9 +167,9 @@ public class BlockificationService {
}
- private TextBlock buildTextBlock(List wordBlockList) {
+ private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) {
- TextBlock textBlock = null;
+ TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
@@ -182,9 +186,14 @@ public class BlockificationService {
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
- textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
+ textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
+ wordBlock.getMaxXDirAdj(),
+ wordBlock.getMinYDirAdj(),
+ wordBlock.getMaxYDirAdj(),
+ wordBlockList,
+ wordBlock.getRotation());
} else {
- TextBlock spatialEntity = textBlock.union(wordBlock);
+ TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
@@ -213,10 +222,38 @@ public class BlockificationService {
List horizontalRulingLines,
List verticalRulingLines) {
- return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
- || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
- || isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
- || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
+ return isSplitByRuling(maxX,
+ minY,
+ word.getMinXDirAdj(),
+ word.getMinYDirAdj(),
+ verticalRulingLines,
+ word.getDir().getDegrees(),
+ word.getPageWidth(),
+ word.getPageHeight()) //
+ || isSplitByRuling(minX,
+ minY,
+ word.getMinXDirAdj(),
+ word.getMaxYDirAdj(),
+ horizontalRulingLines,
+ word.getDir().getDegrees(),
+ word.getPageWidth(),
+ word.getPageHeight()) //
+ || isSplitByRuling(maxX,
+ minY,
+ word.getMinXDirAdj(),
+ word.getMinYDirAdj(),
+ horizontalRulingLines,
+ word.getDir().getDegrees(),
+ word.getPageWidth(),
+ word.getPageHeight()) //
+ || isSplitByRuling(minX,
+ minY,
+ word.getMinXDirAdj(),
+ word.getMaxYDirAdj(),
+ verticalRulingLines,
+ word.getDir().getDegrees(),
+ word.getPageWidth(),
+ word.getPageHeight());
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java
similarity index 75%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java
index a8874081..c695c3c4 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.classification.service;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
@@ -6,17 +6,20 @@ import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
-import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
-import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
@Service
public class BodyTextFrameService {
+ private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
+
+
/**
* Adjusts and sets the body text frame to a page.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@@ -30,7 +33,7 @@ public class BodyTextFrameService {
* @param bodyTextFrame frame that contains the main text on portrait pages
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
*/
- public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
+ public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
@@ -65,26 +68,26 @@ public class BodyTextFrameService {
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
- public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
+ public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
- for (Page page : pages) {
+ for (ClassificationPage page : pages) {
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
continue;
}
- for (AbstractTextContainer container : page.getTextBlocks()) {
+ for (AbstractPageBlock container : page.getTextBlocks()) {
- if (container instanceof TextBlock) {
- TextBlock textBlock = (TextBlock) container;
+ if (container instanceof TextPageBlock) {
+ TextPageBlock textBlock = (TextPageBlock) container;
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
- if (approxLineCount < 2.9f) {
+ if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
continue;
}
@@ -94,15 +97,15 @@ public class BodyTextFrameService {
}
}
- if (container instanceof Table) {
- Table table = (Table) container;
+ if (container instanceof TablePageBlock) {
+ TablePageBlock table = (TablePageBlock) container;
for (List row : table.getRows()) {
for (Cell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
- for (TextBlock textBlock : cell.getTextBlocks()) {
+ for (TextPageBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, page, expansionsRectangle);
}
}
@@ -117,7 +120,7 @@ public class BodyTextFrameService {
}
- private void expandRectangle(TextBlock textBlock, Page page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
+ private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java
similarity index 64%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java
index c5247fda..ea325637 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.classification.service;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
import java.util.regex.Pattern;
@@ -6,11 +6,12 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
-import com.iqser.red.service.redaction.v1.server.classification.model.Document;
-import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -23,7 +24,7 @@ public class ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
- public void classifyDocument(Document document) {
+ public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
@@ -31,43 +32,43 @@ public class ClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
- for (Page page : document.getPages()) {
+ for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
- public void classifyPage(Page page, Document document, List headlineFontSizes) {
+ public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) {
- for (AbstractTextContainer textBlock : page.getTextBlocks()) {
- if (textBlock instanceof TextBlock) {
- classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes);
+ for (AbstractPageBlock textBlock : page.getTextBlocks()) {
+ if (textBlock instanceof TextPageBlock) {
+ classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
- public void classifyBlock(TextBlock textBlock, Page page, Document document, List headlineFontSizes) {
+ public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
- textBlock.setClassification("Other");
+ textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
- textBlock.setClassification("Header");
+ textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
- textBlock.setClassification("Footer");
+ textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
- textBlock.setClassification("Title");
+ textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
@@ -80,36 +81,34 @@ public class ClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
- textBlock.setClassification("H " + i);
+ textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
- } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
- textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
- .getMostPopular()
- .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
+ } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
+ .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
- textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
+ textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
- textBlock.setClassification("TextBlock Bold");
+ textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
- textBlock.setClassification("TextBlock");
+ textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
- textBlock.setClassification("TextBlock Italic");
+ textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
- textBlock.setClassification("TextBlock Unknown");
+ textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
- textBlock.setClassification("Other");
+ textBlock.setClassification(PageBlockType.OTHER);
}
}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java
similarity index 61%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java
index 8223b17f..000c1343 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.segmentation;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.io.File;
import java.io.FileOutputStream;
@@ -16,21 +16,19 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
-import com.iqser.red.service.redaction.v1.server.classification.model.Document;
-import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
-import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
-import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
-import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
-import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
-import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.ImageServiceResponseAdapter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.TableServiceResponseAdapter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
-import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
-import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
-import com.iqser.red.service.redaction.v1.server.tableextraction.utils.FileUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -46,18 +44,21 @@ public class PdfSegmentationService {
private final BlockificationService blockificationService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
- private final ImageService imageService;
- private final TableService tableService;
+ private final ImageServiceResponseAdapter imageServiceResponseAdapter;
+ private final TableServiceResponseAdapter tableServiceResponseAdapter;
- public Document parseDocument(String dossierId, String fileId, InputStream documentInputStream, Map> pdfImages) throws IOException {
+ public ClassificationDocument parseDocument(String dossierId,
+ String fileId,
+ InputStream documentInputStream,
+ Map> pdfImages) throws IOException {
PDDocument pdDocument = null;
File tempFile = null;
try {
Map> pdfTableCells = new HashMap<>();
if (redactionServiceSettings.isCvTableParsingEnabled()) {
- pdfTableCells = tableService.convertTables(dossierId, fileId);
+ pdfTableCells = tableServiceResponseAdapter.convertTables(dossierId, fileId);
}
tempFile = FileUtils.createTempFile("document", ".pdf");
@@ -65,8 +66,8 @@ public class PdfSegmentationService {
IOUtils.copy(documentInputStream, fos);
// initialize required variables
- Document document = new Document();
- List pages = new ArrayList<>();
+ ClassificationDocument document = new ClassificationDocument();
+ List pages = new ArrayList<>();
pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L));
pdDocument.setAllSecurityToBeRemoved(true);
@@ -94,12 +95,12 @@ public class PdfSegmentationService {
}
- private void processPage(Map> pdfImages,
- PDDocument pdDocument,
- Map> pdfTableCells,
- Document document,
- List pages,
- int pageNumber) throws IOException {
+ private void processPage(Map> pdfImages,
+ PDDocument pdDocument,
+ Map> pdfTableCells,
+ ClassificationDocument document,
+ List pages,
+ int pageNumber) throws IOException {
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
@@ -119,7 +120,7 @@ public class PdfSegmentationService {
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
- Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
+ ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
page.setRotation(rotation);
page.setLandscape(isLandscape);
@@ -130,7 +131,7 @@ public class PdfSegmentationService {
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
page.setImages(pdfImages.get(pageNumber));
- imageService.findOcr(page);
+ imageServiceResponseAdapter.findOcr(page);
}
tableExtractionService.extractTables(cleanRulings, page);
@@ -141,7 +142,7 @@ public class PdfSegmentationService {
}
- private void increaseDocumentStatistics(Page page, Document document) {
+ private void increaseDocumentStatistics(ClassificationPage page, ClassificationDocument document) {
if (!page.isLandscape()) {
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
@@ -152,15 +153,15 @@ public class PdfSegmentationService {
}
- private void buildPageStatistics(Page page) {
+ private void buildPageStatistics(ClassificationPage page) {
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
- for (AbstractTextContainer textBlock : page.getTextBlocks()) {
- if (textBlock instanceof TextBlock) {
- if (((TextBlock) textBlock).getSequences() == null) {
+ for (AbstractPageBlock textBlock : page.getTextBlocks()) {
+ if (textBlock instanceof TextPageBlock) {
+ if (((TextPageBlock) textBlock).getSequences() == null) {
continue;
}
- for (TextPositionSequence word : ((TextBlock) textBlock).getSequences()) {
+ for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
page.getTextHeightCounter().add(word.getTextHeight());
page.getFontCounter().add(word.getFont());
page.getFontSizeCounter().add(word.getFontSize());
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java
similarity index 94%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java
index dcc9c498..25f88849 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java
@@ -1,4 +1,4 @@
-package com.iqser.red.service.redaction.v1.server.tableextraction.service;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@@ -12,11 +12,11 @@ import java.util.Map;
import org.springframework.stereotype.Service;
-import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
-import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java
similarity index 51%
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java
index fddcb26f..48b7ccbf 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java
@@ -1,9 +1,8 @@
-package com.iqser.red.service.redaction.v1.server.segmentation;
+package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -11,17 +10,18 @@ import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
-import com.iqser.red.service.redaction.v1.server.classification.model.Document;
-import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
-import com.iqser.red.service.redaction.v1.server.classification.model.Header;
-import com.iqser.red.service.redaction.v1.server.classification.model.Page;
-import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
-import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
-import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
-import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
-import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationSection;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
import lombok.extern.slf4j.Slf4j;
@@ -29,23 +29,23 @@ import lombok.extern.slf4j.Slf4j;
@Service
public class SectionsBuilderService {
- public void buildSections(Document document) {
+ public void buildSections(ClassificationDocument document) {
- List chunkWords = new ArrayList<>();
- List chunkBlockList = new ArrayList<>();
- List headers = new ArrayList<>();
- List | | |