Merge branch 'RED-8694' into 'master'

RED-8694 - Add Javadoc to classes/methods used in rules

Closes RED-8694

See merge request redactmanager/redaction-service!369
This commit is contained in:
Andrei Isvoran 2024-04-11 14:52:33 +02:00
commit e9043c930a
23 changed files with 1158 additions and 47 deletions

View File

@ -11,6 +11,10 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
* Represents a collection of named entity recognition (NER) entities.
* This class provides methods to manage and query NER entities.
*/
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
@ -25,6 +29,12 @@ public class NerEntities {
}
/**
* Checks if there are any entities of a specified type.
*
* @param type The type of entity to check for.
* @return true if there is at least one entity of the specified type, false otherwise.
*/
public boolean hasEntitiesOfType(String type) {
return nerEntityList.stream()
@ -32,6 +42,12 @@ public class NerEntities {
}
/**
* Returns a stream of NER entities of a specified type.
*
* @param type The type of entities to return.
* @return a stream of {@link NerEntity} objects of the specified type.
*/
public Stream<NerEntity> streamEntitiesOfType(String type) {
return nerEntityList.stream()
@ -39,6 +55,9 @@ public class NerEntities {
}
/**
 * Represents a single NER entity with its value, text range, and type.
 *
 * @param value     The value of the entity.
 * @param textRange The {@link TextRange} associated with the entity.
 * @param type      The type of the entity.
 */
public record NerEntity(String value, TextRange textRange, String type) {
}

View File

@ -23,6 +23,9 @@ import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundExcepti
import lombok.Data;
import lombok.Getter;
/**
* A class representing a dictionary used for redaction processes, containing various dictionary models and their versions.
*/
@Data
public class Dictionary {
@ -51,6 +54,11 @@ public class Dictionary {
}
/**
* Checks if the dictionary contains local entries.
*
* @return true if any dictionary model contains local entries, false otherwise.
*/
public boolean hasLocalEntries() {
return dictionaryModels.stream()
@ -64,6 +72,13 @@ public class Dictionary {
}
/**
* Retrieves the {@link DictionaryModel} of a specified type.
*
* @param type The type of dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type.
* @throws NotFoundException If the specified type is not found in the dictionary.
*/
public DictionaryModel getType(String type) {
DictionaryModel model = localAccessMap.get(type);
@ -74,6 +89,12 @@ public class Dictionary {
}
/**
* Checks if the dictionary of a specific type is considered a hint.
*
* @param type The type of dictionary to check.
* @return true if the dictionary model is marked as a hint, false otherwise.
*/
public boolean isHint(String type) {
DictionaryModel model = localAccessMap.get(type);
@ -84,6 +105,12 @@ public class Dictionary {
}
/**
* Checks if the dictionary of a specific type is case-insensitive.
*
* @param type The type of dictionary to check.
* @return true if the dictionary is case-insensitive, false otherwise.
*/
public boolean isCaseInsensitiveDictionary(String type) {
DictionaryModel dictionaryModel = localAccessMap.get(type);
@ -94,6 +121,18 @@ public class Dictionary {
}
/**
* Adds a local dictionary entry of a specific type.
*
* @param type The type of dictionary to add the entry to.
* @param value The value of the entry.
* @param matchedRules A collection of {@link MatchedRule} associated with the entry.
* @param alsoAddLastname Indicates whether to also add the lastname separately as an entry.
* @throws IllegalArgumentException If the specified type does not exist within the dictionary, if the type
* does not have any local entries defined, or if the provided value is
* blank. This ensures that only valid, non-empty entries
* are added to the dictionary.
*/
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname) {
if (value.isBlank()) {
@ -133,18 +172,33 @@ public class Dictionary {
}
/**
 * Recommends the value of a text entity by passing the entity's type, value and matched rules
 * to {@code addLocalDictionaryEntry}. The last name is not added as a separate entry.
 *
 * @param textEntity The {@link TextEntity} to be recommended.
 */
public void recommendEverywhere(TextEntity textEntity) {
    final boolean alsoAddLastname = false;
    addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), alsoAddLastname);
}
/**
 * Recommends the value of a text entity by passing the entity's type, value and matched rules
 * to {@code addLocalDictionaryEntry}, additionally requesting that the last name be added separately.
 *
 * @param textEntity The {@link TextEntity} to be recommended.
 */
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity) {
    final boolean alsoAddLastname = true;
    addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), alsoAddLastname);
}
/**
* Adds multiple author names contained within a text entity as recommendations in the dictionary.
*
* @param textEntity The {@link TextEntity} containing author names to be added.
*/
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity) {
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true));
@ -152,6 +206,12 @@ public class Dictionary {
}
/**
* Splits a {@link TextEntity} into individual author names based on commas or new lines.
*
* @param textEntity The {@link TextEntity} to split.
* @return A list of strings where each string is an author name.
*/
public static List<String> splitIntoAuthorNames(TextEntity textEntity) {
List<String> splitAuthorNames;

View File

@ -13,6 +13,12 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRu
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
/**
* Represents a model of a dictionary containing entries for redaction processes.
* It includes various types of entries such as standard entries, false positives,
* and false recommendations. Additionally, it manages local entries with matched
* rules for enhanced search and matching capabilities.
*/
@Data
@Slf4j
public class DictionaryModel implements Serializable {
@ -36,6 +42,19 @@ public class DictionaryModel implements Serializable {
private transient SearchImplementation localSearch;
/**
* Constructs a new DictionaryModel with specified parameters.
*
* @param type The type of the dictionary model.
* @param rank The rank order of the dictionary model.
* @param color An array representing the color associated with this model.
* @param caseInsensitive Flag indicating whether the dictionary is case-insensitive.
* @param hint Flag indicating whether this model should be used as a hint.
* @param entries Set of dictionary entry models representing the entries.
* @param falsePositives Set of dictionary entry models representing false positives.
* @param falseRecommendations Set of dictionary entry models representing false recommendations.
* @param isDossierDictionary Flag indicating whether this model is for a dossier dictionary.
*/
public DictionaryModel(String type,
int rank,
float[] color,
@ -52,13 +71,17 @@ public class DictionaryModel implements Serializable {
this.caseInsensitive = caseInsensitive;
this.hint = hint;
this.isDossierDictionary = isDossierDictionary;
this.entries = entries;
this.falsePositives = falsePositives;
this.falseRecommendations = falseRecommendations;
}
/**
* Returns the search implementation for local entries.
*
* @return The {@link SearchImplementation} for local entries.
*/
public SearchImplementation getLocalSearch() {
if (this.localSearch == null || this.localSearch.getValues().size() != this.localEntriesWithMatchedRules.size()) {
@ -68,6 +91,11 @@ public class DictionaryModel implements Serializable {
}
/**
* Returns the search implementation for non-deleted dictionary entries.
*
* @return The {@link SearchImplementation} for non-deleted dictionary entries.
*/
public SearchImplementation getEntriesSearch() {
if (entriesSearch == null) {
@ -80,6 +108,11 @@ public class DictionaryModel implements Serializable {
}
/**
* Returns the search implementation for deleted dictionary entries.
*
* @return The {@link SearchImplementation} for deleted dictionary entries.
*/
public SearchImplementation getDeletionEntriesSearch() {
if (deletionEntriesSearch == null) {
@ -92,6 +125,11 @@ public class DictionaryModel implements Serializable {
}
/**
* Returns the search implementation for non-deleted false positive entries.
*
* @return The {@link SearchImplementation} for non-deleted false positive entries.
*/
public SearchImplementation getFalsePositiveSearch() {
if (falsePositiveSearch == null) {
@ -104,6 +142,11 @@ public class DictionaryModel implements Serializable {
}
/**
* Returns the search implementation for non-deleted false recommendation entries.
*
* @return The {@link SearchImplementation} for non-deleted false recommendation entries.
*/
public SearchImplementation getFalseRecommendationsSearch() {
if (falseRecommendationsSearch == null) {
@ -116,12 +159,17 @@ public class DictionaryModel implements Serializable {
}
/**
 * Looks up the matched rules stored for a value among the local dictionary entries.
 * If the dictionary is case-insensitive, the value is lower-cased (using {@link Locale#US})
 * before the lookup; otherwise it is used unchanged.
 *
 * @param value The value for which to retrieve the matched rules.
 * @return The set of {@link MatchedRule} stored under the (possibly lower-cased) value,
 * or whatever the underlying lookup yields when no entry exists (presumably null).
 */
public Set<MatchedRule> getMatchedRulesForLocalDictionaryEntry(String value) {
    String lookupKey = value;
    if (isCaseInsensitive()) {
        lookupKey = value.toLowerCase(Locale.US);
    }
    return localEntriesWithMatchedRules.get(lookupKey);
}
}

View File

@ -11,6 +11,10 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
* Represents a range of text defined by a start and end index.
* Provides functionality to check containment, intersection, and to adjust ranges based on specified conditions.
*/
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
@ -20,6 +24,13 @@ public class TextRange implements Comparable<TextRange> {
private int end;
/**
* Constructs a TextRange with specified start and end indexes.
*
* @param start The starting index of the range.
* @param end The ending index of the range.
* @throws IllegalArgumentException If start is greater than end.
*/
public TextRange(int start, int end) {
if (start > end) {
@ -30,6 +41,11 @@ public class TextRange implements Comparable<TextRange> {
}
/**
* Returns the length of the text range.
*
* @return The length of the range.
*/
public int length() {
return end - start;
@ -48,18 +64,38 @@ public class TextRange implements Comparable<TextRange> {
}
/**
 * Checks if this {@link TextRange} fully contains another TextRange.
 * Coinciding boundaries count as contained.
 *
 * @param textRange The {@link TextRange} to check.
 * @return true if the specified range starts at or after this range's start and ends at or before this range's end, false otherwise.
 */
public boolean contains(TextRange textRange) {
    if (textRange.start() < start) {
        return false;
    }
    return textRange.end() <= end;
}
/**
 * Checks if this {@link TextRange} is fully contained by another TextRange.
 * This is the mirrored form of {@code textRange.contains(this)}.
 *
 * @param textRange The {@link TextRange} to check against.
 * @return true if the specified range contains this range, false otherwise.
 */
public boolean containedBy(TextRange textRange) {
    boolean enclosedByOther = textRange.contains(this);
    return enclosedByOther;
}
/**
* Checks if this {@link TextRange} contains another range specified by start and end indices.
*
* @param start The starting index of the range to check.
* @param end The ending index of the range to check.
* @return true if this range fully contains the specified range, false otherwise.
* @throws IllegalArgumentException If the start index is greater than the end index.
*/
public boolean contains(int start, int end) {
if (start > end) {
@ -69,6 +105,14 @@ public class TextRange implements Comparable<TextRange> {
}
/**
* Checks if this {@link TextRange} is fully contained within another range specified by start and end indices.
*
* @param start The starting index of the outer range.
* @param end The ending index of the outer range.
* @return true if this range is fully contained within the specified range, false otherwise.
* @throws IllegalArgumentException If the start index is greater than the end index.
*/
public boolean containedBy(int start, int end) {
if (start > end) {
@ -78,18 +122,37 @@ public class TextRange implements Comparable<TextRange> {
}
/**
 * Determines if the specified index lies within this {@link TextRange}.
 * The start is inclusive and the end is exclusive.
 *
 * @param index The index to check.
 * @return true if {@code start <= index < end}, false otherwise.
 */
public boolean contains(int index) {
    if (index < start) {
        return false;
    }
    return index < end;
}
/**
 * Checks if this {@link TextRange} intersects with another {@link TextRange}.
 * Both comparisons are strict, so ranges that merely touch (one ends where the other starts)
 * do not intersect.
 *
 * @param textRange The {@link TextRange} to check for intersection.
 * @return true if the ranges intersect, false otherwise.
 */
public boolean intersects(TextRange textRange) {
    if (textRange.start() >= this.end) {
        return false;
    }
    return this.start < textRange.end();
}
/**
* Splits this TextRange into multiple ranges based on a list of indices.
*
* @param splitIndices The indices at which to split the range.
* @return A list of TextRanges resulting from the split.
* @throws IndexOutOfBoundsException If any split index is outside this TextRange.
*/
public List<TextRange> split(List<Integer> splitIndices) {
if (splitIndices.stream()
@ -116,6 +179,13 @@ public class TextRange implements Comparable<TextRange> {
}
/**
* Merges a collection of TextRanges into a single TextRange encompassing all.
*
* @param boundaries The collection of TextRanges to merge.
* @return A new TextRange covering the entire span of the given ranges.
* @throws IllegalArgumentException If boundaries are empty.
*/
public static TextRange merge(Collection<TextRange> boundaries) {
int minStart = boundaries.stream()
@ -152,16 +222,17 @@ public class TextRange implements Comparable<TextRange> {
/**
* shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without trailing or preceding whitespaces.
* Shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without trailing or preceding whitespaces.
*
* @param textBlock TextBlock to check whitespaces against
* @return trimmed boundary
* @return Trimmed boundary
*/
public TextRange trim(TextBlock textBlock) {
if (this.length() == 0) {
return this;
}
int trimmedStart = this.start;
while (textBlock.containsIndex(trimmedStart) && trimmedStart < end && Character.isWhitespace(textBlock.charAt(trimmedStart))) {
trimmedStart++;

View File

@ -12,27 +12,64 @@ import lombok.NonNull;
public interface IEntity {
/**
* Gets the list of rules matched against this entity.
*
* @return A priority queue of matched rules.
*/
PriorityQueue<MatchedRule> getMatchedRuleList();
/**
* Gets the manual overwrite actions applied to this entity, if any.
*
* @return The manual overwrite details.
*/
ManualChangeOverwrite getManualOverwrite();
/**
* Gets the value of this entity as a string.
*
* @return The string value.
*/
String getValue();
/**
* Gets the range of text in the document associated with this entity.
*
* @return The text range.
*/
TextRange getTextRange();
/**
* Gets the type of this entity.
*
* @return The entity type.
*/
String type();
/**
 * Calculates the length of the string returned by {@code value()}.
 *
 * @return The length of the value.
 */
default int length() {
    String currentValue = value();
    return currentValue.length();
}
/**
* Retrieves the value of the entity, considering any manual overwrite.
* If no manual overwrite value is found, return the value of the entity or an empty string
* if that value is null.
*
* @return The possibly overwritten value
*/
default String value() {
return getManualOverwrite().getValue()
@ -40,6 +77,11 @@ public interface IEntity {
}
/**
* Determines if the entity has been applied, considering manual overwrites.
*
* @return True if applied, false otherwise.
*/
// Don't use default accessor pattern (e.g. isApplied()), as it might lead to errors in drools due to property-specific optimization of the drools planner.
default boolean applied() {
@ -48,12 +90,22 @@ public interface IEntity {
}
/**
 * Determines if the entity has been skipped, which is the exact negation of {@code applied()}.
 *
 * @return True if the entity is not applied, false otherwise.
 */
default boolean skipped() {
    if (applied()) {
        return false;
    }
    return true;
}
/**
* Determines if the entity has been ignored, considering manual overwrites.
*
* @return True if ignored, false otherwise.
*/
default boolean ignored() {
return getManualOverwrite().getIgnored()
@ -61,6 +113,11 @@ public interface IEntity {
}
/**
* Determines if the entity has been removed, considering manual overwrites.
*
* @return True if removed, false otherwise.
*/
default boolean removed() {
return getManualOverwrite().getRemoved()
@ -68,6 +125,11 @@ public interface IEntity {
}
/**
* Checks if the entity has been resized, considering manual overwrites.
*
* @return True if resized, false otherwise.
*/
default boolean resized() {
return getManualOverwrite().getResized()
@ -75,24 +137,48 @@ public interface IEntity {
}
/**
 * Checks if the entity is considered active.
 * An active entity is neither removed nor ignored.
 *
 * @return True if active, false otherwise.
 */
default boolean active() {
    return !removed() && !ignored();
}
/**
 * Checks if there are any manual changes applied to the entity, i.e. whether the manual change log
 * of the entity's manual overwrite is non-empty.
 *
 * @return True if there are manual changes, false otherwise.
 */
default boolean hasManualChanges() {
    ManualChangeOverwrite overwrite = getManualOverwrite();
    return !overwrite.getManualChangeLog().isEmpty();
}
/**
 * Retrieves the references of the rule returned by {@code getMatchedRule()}.
 *
 * @return A set of referenced {@link TextEntity} objects.
 */
default Set<TextEntity> references() {
    MatchedRule matchedRule = getMatchedRule();
    return matchedRule.getReferences();
}
/**
* Applies a redaction to the entity with a specified legal basis.
*
* @param ruleIdentifier The identifier of the rule being applied.
* @param reason The reason for the redaction.
* @param legalBasis The legal basis for the redaction, which must not be blank or empty.
* @throws IllegalArgumentException If the legal basis is blank or empty.
*/
default void redact(@NonNull String ruleIdentifier, String reason, @NonNull String legalBasis) {
if (legalBasis.isBlank() || legalBasis.isEmpty()) {
@ -102,36 +188,75 @@ public interface IEntity {
}
/**
 * Applies a rule to the entity by adding a matched rule that is flagged as applied.
 * Unlike {@code redact}, the legal basis is not annotated as non-null here.
 *
 * @param ruleIdentifier The identifier of the rule being applied.
 * @param reason The reason for applying the rule.
 * @param legalBasis The legal basis for the application, can be a default or unspecified value.
 */
default void apply(@NonNull String ruleIdentifier, String reason, String legalBasis) {
    MatchedRule appliedRule = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.fromString(ruleIdentifier))
            .reason(reason)
            .legalBasis(legalBasis)
            .applied(true)
            .build();
    addMatchedRule(appliedRule);
}
/**
 * Applies a rule to the entity without specifying a legal basis; the placeholder "n-a" is used instead.
 *
 * @param ruleIdentifier The identifier of the rule being applied.
 * @param reason The reason for applying the rule.
 */
default void apply(@NonNull String ruleIdentifier, String reason) {
    final String placeholderLegalBasis = "n-a";
    apply(ruleIdentifier, reason, placeholderLegalBasis);
}
/**
 * Marks the entity as skipped according to a specific rule.
 * The added matched rule carries only the rule identifier and the reason; no applied, removed
 * or ignored flag is set explicitly.
 *
 * @param ruleIdentifier The identifier of the rule being skipped.
 * @param reason The reason for skipping the rule.
 */
default void skip(@NonNull String ruleIdentifier, String reason) {
    MatchedRule skippedRule = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.fromString(ruleIdentifier))
            .reason(reason)
            .build();
    addMatchedRule(skippedRule);
}
/**
 * Marks the entity as removed by adding a matched rule flagged as removed.
 *
 * @param ruleIdentifier The identifier of the rule based on which the entity is removed.
 * @param reason The reason for the removal.
 */
default void remove(String ruleIdentifier, String reason) {
    MatchedRule removingRule = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.fromString(ruleIdentifier))
            .reason(reason)
            .removed(true)
            .build();
    addMatchedRule(removingRule);
}
/**
 * Marks the entity as ignored by adding a matched rule flagged as ignored.
 *
 * @param ruleIdentifier The identifier of the rule based on which the entity is ignored.
 * @param reason The reason for ignoring the entity.
 */
default void ignore(String ruleIdentifier, String reason) {
    MatchedRule ignoringRule = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.fromString(ruleIdentifier))
            .reason(reason)
            .ignored(true)
            .build();
    addMatchedRule(ignoringRule);
}
/**
* Applies a rule to the entity, indicating that the value should be written with line breaks.
*
* @param ruleIdentifier The identifier of the rule being applied.
* @param reason The reason for the rule application.
* @param legalBasis The legal basis for the rule, which must not be empty.
* @throws IllegalArgumentException If the legal basis is blank or empty.
*/
default void applyWithLineBreaks(@NonNull String ruleIdentifier, String reason, @NonNull String legalBasis) {
if (legalBasis.isBlank() || legalBasis.isEmpty()) {
@ -147,6 +272,15 @@ public interface IEntity {
}
/**
* Applies a rule to the entity with a collection of references.
*
* @param ruleIdentifier The identifier of the rule being applied.
* @param reason The reason for the rule application.
* @param legalBasis The legal basis for the rule, which must not be empty.
* @param references A collection of text entities that are referenced by this rule application.
* @throws IllegalArgumentException If the legal basis is blank or empty.
*/
default void applyWithReferences(@NonNull String ruleIdentifier, String reason, @NonNull String legalBasis, Collection<TextEntity> references) {
if (legalBasis.isBlank() || legalBasis.isEmpty()) {
@ -162,18 +296,35 @@ public interface IEntity {
}
/**
 * Marks the entity as skipped for a specific rule and associates a collection of references.
 * The references are copied into a new {@link HashSet} before being stored on the matched rule,
 * and the rule is added directly to the matched rule list.
 *
 * @param ruleIdentifier The identifier of the rule being skipped.
 * @param reason The reason for skipping the rule.
 * @param references A collection of text entities that are referenced by the skipped rule.
 */
default void skipWithReferences(@NonNull String ruleIdentifier, String reason, Collection<TextEntity> references) {
    // Fetch the list first so the evaluation order matches a single chained call.
    PriorityQueue<MatchedRule> matchedRules = getMatchedRuleList();
    MatchedRule skippedRule = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.fromString(ruleIdentifier))
            .reason(reason)
            .references(new HashSet<>(references))
            .build();
    matchedRules.add(skippedRule);
}
/**
 * Adds a single matched rule to this entity's matched rule list.
 *
 * @param matchedRule The matched rule to add.
 */
default void addMatchedRule(MatchedRule matchedRule) {
    PriorityQueue<MatchedRule> matchedRules = getMatchedRuleList();
    matchedRules.add(matchedRule);
}
/**
* Adds a collection of matched rules to this entity.
*
* @param matchedRules The collection of matched rules to add.
*/
default void addMatchedRules(Collection<MatchedRule> matchedRules) {
if (getMatchedRuleList().equals(matchedRules)) {
@ -183,12 +334,22 @@ public interface IEntity {
}
/**
 * Retrieves the 'unit' value of the rule identifier belonging to the rule returned by {@code getMatchedRule()}.
 *
 * @return The unit value of the matched rule.
 */
default int getMatchedRuleUnit() {
    RuleIdentifier identifier = getMatchedRule().getRuleIdentifier();
    return identifier.unit();
}
/**
* Gets the highest priority matched rule for this entity.
*
* @return The matched rule.
*/
default MatchedRule getMatchedRule() {
if (getMatchedRuleList().isEmpty()) {
@ -198,6 +359,11 @@ public interface IEntity {
}
/**
* Builds a reason string for this entity, incorporating descriptions from manual changes.
*
* @return The built reason string.
*/
default String buildReasonWithManualChangeDescriptions() {
if (getManualOverwrite().getDescriptions().isEmpty()) {
@ -210,6 +376,11 @@ public interface IEntity {
}
/**
* Retrieves the legal basis for the action taken on this entity, considering any manual overwrite.
*
* @return The legal basis.
*/
default String legalBasis() {
return getManualOverwrite().getLegalBasis()

View File

@ -15,6 +15,9 @@ import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
* Represents a rule that has been matched during the document redaction process.
*/
@Getter
@Builder
@AllArgsConstructor
@ -42,12 +45,26 @@ public final class MatchedRule implements Comparable<MatchedRule> {
Set<TextEntity> references = Collections.emptySet();
/**
 * Creates an empty instance of {@link MatchedRule}, built with only an empty {@link RuleIdentifier}.
 * This can be used as a placeholder or when no rule is actually matched.
 *
 * @return An empty {@link MatchedRule} instance.
 */
public static MatchedRule empty() {
    MatchedRule placeholder = MatchedRule.builder()
            .ruleIdentifier(RuleIdentifier.empty())
            .build();
    return placeholder;
}
/**
* Returns a modified instance of {@link MatchedRule} based on its applied status.
* If the rule has been applied, it returns a new {@link MatchedRule} instance that retains all properties of the original
* except for the 'applied' status, which is set to false.
* If the rule has not been applied, it returns the original instance.
*
* @return A {@link MatchedRule} instance with 'applied' set to false.
*/
public MatchedRule asSkippedIfApplied() {
if (!this.isApplied()) {
@ -63,6 +80,13 @@ public final class MatchedRule implements Comparable<MatchedRule> {
}
/**
* Compares this rule with another {@link MatchedRule} to establish a priority order.
* The comparison is based on the rule type, unit, and ID, in that order.
*
* @param matchedRule The {@link MatchedRule} to compare against.
* @return A negative integer, zero, or a positive integer as this rule is less than, equal to, or greater than the specified rule.
*/
@Override
public int compareTo(MatchedRule matchedRule) {

View File

@ -24,6 +24,9 @@ import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
* Represents the entire document as a node within the document's semantic structure.
*/
@Data
@Builder
@AllArgsConstructor
@ -63,6 +66,11 @@ public class Document implements GenericSemanticNode {
}
/**
* Gets the main sections of the document as a list.
*
* @return A list of main sections within the document.
*/
public List<Section> getMainSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
@ -70,6 +78,11 @@ public class Document implements GenericSemanticNode {
}
/**
* Streams all terminal (leaf) text blocks within the document in their natural order.
*
* @return A stream of terminal {@link TextBlock}.
*/
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
@ -99,6 +112,11 @@ public class Document implements GenericSemanticNode {
}
/**
* Streams all nodes within the document, regardless of type, in their natural order.
*
* @return A stream of all {@link SemanticNode} within the document.
*/
private Stream<SemanticNode> streamAllNodes() {
return documentTree.allEntriesInOrder()
@ -106,6 +124,11 @@ public class Document implements GenericSemanticNode {
}
/**
* Streams all image nodes contained within the document.
*
* @return A stream of {@link Image} nodes.
*/
public Stream<Image> streamAllImages() {
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);

View File

@ -19,6 +19,9 @@ import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
* Represents the header part of a document page.
*/
@Data
@Builder
@AllArgsConstructor

View File

@ -20,6 +20,9 @@ import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
* Represents a headline in a document.
*/
@Data
@Builder
@AllArgsConstructor
@ -98,12 +101,22 @@ public class Headline implements GenericSemanticNode {
}
/**
 * Creates an empty headline whose leaf text block is an empty {@link AtomicTextBlock}
 * attached to a freshly created {@link Page}.
 *
 * @return An empty {@link Headline} instance.
 */
public static Headline empty() {
    Headline emptyHeadline = Headline.builder()
            .leafTextBlock(AtomicTextBlock.empty(-1L, 0, new Page(), -1, null))
            .build();
    return emptyHeadline;
}
/**
* Checks if this headline is associated with any paragraphs within its parent section or node.
*
* @return True if there are paragraphs associated with this headline, false otherwise.
*/
public boolean hasParagraphs() {
return getParent().streamAllSubNodesOfType(NodeType.PARAGRAPH)

View File

@ -28,6 +28,10 @@ import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
* Represents an image within the document.
*/
@Data
@Builder
@AllArgsConstructor

View File

@ -17,6 +17,9 @@ import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
* Represents a single page in a document.
*/
@Getter
@Setter
@Builder
@ -43,6 +46,11 @@ public class Page {
Set<Image> images = new HashSet<>();
/**
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
*
* @return The main body text block.
*/
public TextBlock getMainBodyTextBlock() {
return mainBody.stream()

View File

@ -19,6 +19,9 @@ import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
* Represents a paragraph in the document.
*/
@Data
@SuperBuilder
@AllArgsConstructor

View File

@ -21,6 +21,9 @@ import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
* Represents a section within a document, encapsulating both its textual content and semantic structure.
*/
@Slf4j
@Data
@Builder
@ -51,6 +54,11 @@ public class Section implements GenericSemanticNode {
}
/**
* Checks if this section contains any tables.
*
* @return True if the section contains at least one table, false otherwise.
*/
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
@ -91,12 +99,24 @@ public class Section implements GenericSemanticNode {
}
/**
 * Checks if any headline node among this section's sub-nodes contains a given string.
 *
 * @param value The string to search for within headlines, case-sensitive.
 * @return True if at least one headline contains the specified string, false otherwise.
 */
public boolean anyHeadlineContainsString(String value) {
    return streamAllSubNodesOfType(NodeType.HEADLINE)
            .anyMatch(headlineNode -> headlineNode.containsString(value));
}
/**
* Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive.
*
* @param value The string to search for within headlines, case-insensitive.
* @return True if at least one headline contains the specified string, false otherwise.
*/
public boolean anyHeadlineContainsStringIgnoreCase(String value) {
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value));

View File

@ -10,6 +10,9 @@ import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
* Represents a unique identifier for a section within a document.
*/
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
@ -28,6 +31,12 @@ public class SectionIdentifier {
boolean asChild;
/**
* Generates a SectionIdentifier from the headline text of a section, determining its format and structure.
*
* @param headline The headline text from which to generate the section identifier.
* @return A {@link SectionIdentifier} instance corresponding to the headline text.
*/
public static SectionIdentifier fromSearchText(String headline) {
if (headline == null || headline.isEmpty() || headline.isBlank()) {
@ -43,18 +52,34 @@ public class SectionIdentifier {
}
/**
 * Creates a new {@link SectionIdentifier} that copies the format and identifiers of the given
 * section identifier, uses its string representation, and passes {@code true} as the last constructor
 * argument (presumably the asChild flag).
 *
 * @param sectionIdentifier The parent section identifier.
 * @return A new {@link SectionIdentifier} instance marked as a child.
 */
public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {
    final SectionIdentifier parent = sectionIdentifier;
    return new SectionIdentifier(parent.format, parent.toString(), parent.identifiers, true);
}
/**
 * Generates a SectionIdentifier that represents the entire document: it uses
 * {@code Format.DOCUMENT}, the text "document" and an empty identifier list.
 *
 * @return A {@link SectionIdentifier} with a document-wide scope.
 */
public static SectionIdentifier document() {
    SectionIdentifier documentIdentifier = new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
    return documentIdentifier;
}
/**
* Generates an empty SectionIdentifier.
*
* @return An empty {@link SectionIdentifier} instance.
*/
public static SectionIdentifier empty() {
return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
@ -109,6 +134,12 @@ public class SectionIdentifier {
}
/**
* Determines if the current section is a child of the given section, based on their identifiers.
*
* @param sectionIdentifier The section identifier to compare against.
* @return True if the current section is a child of the given section, false otherwise.
*/
public boolean isChildOf(SectionIdentifier sectionIdentifier) {
if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) {

View File

@ -44,11 +44,12 @@ public interface SemanticNode {
*/
default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock)
.collect(new TextBlockCollector());
}
/**
* Any Node maintains its own Set of Entities.
* This Set contains all Entities whose TextRange intersects the TextRange of this node.
@ -437,10 +438,10 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains all the provided Strings ignoring case.
* Checks whether this SemanticNode contains the provided String case-insensitive.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string ignoring case
* @return true, if this node's TextBlock contains the string case-insensitive
*/
default boolean containsStringIgnoreCase(String string) {
@ -449,7 +450,7 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
* Checks whether this SemanticNode contains any of the provided Strings case-insensitive.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
@ -462,7 +463,7 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
* Checks whether this SemanticNode contains any of the provided Strings case-insensitive.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
@ -489,7 +490,7 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains exactly the provided String as a word ignoring case.
* Checks whether this SemanticNode contains exactly the provided String as a word case-insensitive.
*
* @param word - String which the TextBlock might contain
* @return true, if this node's TextBlock contains string
@ -519,7 +520,7 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains any of the provided Strings as a word ignoring case.
* Checks whether this SemanticNode contains any of the provided Strings as a word case-insensitive.
*
* @param words - A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the provided strings
@ -551,7 +552,7 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode contains all the provided Strings as word ignoring case.
* Checks whether this SemanticNode contains all the provided Strings as word case-insensitive.
*
* @param words - A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all the provided strings
@ -580,10 +581,10 @@ public interface SemanticNode {
/**
* Checks whether this SemanticNode matches the provided regex pattern ignoring case.
* Checks whether this SemanticNode matches the provided regex pattern case-insensitive.
*
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
* @return true, if this node's TextBlock contains the regex pattern ignoring case
* @return true, if this node's TextBlock contains the regex pattern case-insensitive
*/
default boolean matchesRegexIgnoreCase(String regexPattern) {

View File

@ -26,6 +26,9 @@ import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
* Represents a table within a document.
*/
@Data
@Builder
@AllArgsConstructor

View File

@ -20,6 +20,9 @@ import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
* Represents a single table cell within a table.
*/
@Data
@Builder
@AllArgsConstructor

View File

@ -58,6 +58,12 @@ public class ManualChangesApplicationService {
}
/**
* Resizes a text entity based on manual resize redaction details.
*
* @param entityToBeResized The entity to resize.
* @param manualResizeRedaction The details of the resize operation.
*/
public void resize(TextEntity entityToBeResized, ManualResizeRedaction manualResizeRedaction) {
resizeEntityAndReinsert(entityToBeResized, manualResizeRedaction);
@ -140,6 +146,12 @@ public class ManualChangesApplicationService {
}
/**
* Resizes an image entity based on manual resize redaction instructions.
*
* @param image The image to resize.
* @param manualResizeRedaction The details of the resize operation.
*/
public void resizeImage(Image image, ManualResizeRedaction manualResizeRedaction) {
if (manualResizeRedaction.getPositions().isEmpty() || manualResizeRedaction.getPositions() == null) {

View File

@ -54,6 +54,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, case-sensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -65,6 +76,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, case-insensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -76,6 +98,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the start string in the entity, case-sensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -92,6 +125,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the start string in the entity, case-insensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -108,6 +152,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the end string in the entity, case-sensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -124,6 +179,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the end string in the entity, case-insensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -140,6 +206,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the start and end string in the entity, case-sensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -160,6 +237,17 @@ public class EntityCreationService {
}
/**
* Creates entities found between specified start and stop strings, including the start and end string in the entity, case-insensitive.
*
* @param start The starting string to search for.
* @param stop The stopping string to search for.
* @param type The type of entity to create.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(start, stop);
@ -180,6 +268,17 @@ public class EntityCreationService {
}
/**
* Identifies the shortest text entities found between any of the given start and stop strings within a specified semantic node, case-sensitive.
*
* @param starts A list of start strings to search for.
* @param stops A list of stop strings to search for.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> shortestBetweenAnyString(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(starts, stops);
@ -191,6 +290,17 @@ public class EntityCreationService {
}
/**
* Identifies the shortest text entities found between any of the given start and stop strings within a specified semantic node, case-insensitive.
*
* @param starts A list of start strings to search for.
* @param stops A list of stop strings to search for.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which to search.
* @return A stream of {@link TextEntity} identified objects.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.
*/
public Stream<TextEntity> shortestBetweenAnyStringIgnoreCase(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node) {
checkIfBothStartAndEndAreEmpty(starts, stops);
@ -202,6 +312,18 @@ public class EntityCreationService {
}
/**
* Identifies the shortest text entities found between any of the given start and stop strings within a specified semantic node,
* case-insensitive, with a length limit.
*
* @param starts A list of start strings to search for, case-insensitively.
* @param stops A list of stop strings to search for, case-insensitively.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @param limit The maximum length of the entity text.
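* @return A stream of {@link TextEntity} objects found between any of the start and stop strings, case-insensitively, and within the specified limit.
* @throws IllegalArgumentException if both start and stop strings are empty, indicating there's nothing to search for.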
*/
public Stream<TextEntity> shortestBetweenAnyStringIgnoreCase(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node, int limit) {
checkIfBothStartAndEndAreEmpty(starts, stops);
@ -213,6 +335,16 @@ public class EntityCreationService {
}
/**
* Creates entities based on the boundaries identified between start and stop regular expressions within a specified semantic node.
*
* @param regexStart The regular expression defining the start boundary.
* @param regexStop The regular expression defining the stop boundary.
* @param type The type of entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects identified between the start and stop regular expressions.
*/
public Stream<TextEntity> betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -223,6 +355,17 @@ public class EntityCreationService {
}
/**
* Creates entities based on the boundaries identified between start and stop regular expressions within a specified semantic node,
* case-insensitive.
*
* @param regexStart The regular expression defining the start boundary, case-insensitive.
* @param regexStop The regular expression defining the stop boundary, case-insensitive.
* @param type The type of entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects identified between the start and stop regular expressions, case-insensitively.
*/
public Stream<TextEntity> betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -233,12 +376,35 @@ public class EntityCreationService {
}
/**
 * Creates entities from the spans found between the given start and stop text ranges within a semantic node,
 * without restricting the entity length.
 * This is the general entry point for callers that already hold lists of start and stop {@link TextRange} objects;
 * it delegates to the overload that takes a limit, passing {@code 0}.
 *
 * @param startBoundaries A list of start text range boundaries.
 * @param stopBoundaries  A list of stop text range boundaries.
 * @param type            The type of entity to be created.
 * @param entityType      The detailed classification of the entity.
 * @param node            The semantic node within which the search is performed.
 * @return A stream of {@link TextEntity} objects identified between the start and stop text ranges.
 */
public Stream<TextEntity> betweenTextRanges(List<TextRange> startBoundaries, List<TextRange> stopBoundaries, String type, EntityType entityType, SemanticNode node) {
    final int noLimit = 0;
    return betweenTextRanges(startBoundaries, stopBoundaries, type, entityType, node, noLimit);
}
/**
* Creates entities based on the boundaries identified between specified start and stop text ranges within a semantic node,
* with an optional length limit for the entities.
*
* @param startBoundaries A list of start text range boundaries.
* @param stopBoundaries A list of stop text range boundaries.
* @param type The type of entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @param limit The maximum length of the entity text; use 0 for no limit.
* @return A stream of {@link TextEntity} objects identified between the start and stop text ranges, within the specified limit.
*/
public Stream<TextEntity> betweenTextRanges(List<TextRange> startBoundaries, List<TextRange> stopBoundaries, String type, EntityType entityType, SemanticNode node, int limit) {
List<TextRange> entityBoundaries = findNonOverlappingBoundariesBetweenBoundariesWithMinimalDistances(startBoundaries, stopBoundaries);
@ -283,6 +449,15 @@ public class EntityCreationService {
}
/**
* Creates text entities based on boundaries identified by a search implementation within a specified semantic node.
*
* @param searchImplementation The search implementation to use for identifying boundaries.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects corresponding to the identified boundaries.
*/
public Stream<TextEntity> bySearchImplementation(SearchImplementation searchImplementation, String type, EntityType entityType, SemanticNode node) {
return searchImplementation.getBoundaries(node.getTextBlock(), node.getTextRange())
@ -294,6 +469,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities located immediately after the specified strings within a semantic node.
*
* @param strings A list of strings to search for. The text immediately following each string is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects found immediately after the specified strings.
*/
public Stream<TextEntity> lineAfterStrings(List<String> strings, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -308,6 +492,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities located immediately after the specified strings within a semantic node, case-insensitive.
*
* @param strings A list of strings to search for, case-insensitive. The text immediately following each string is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects found immediately after the specified strings, case-insensitively.
*/
public Stream<TextEntity> lineAfterStringsIgnoreCase(List<String> strings, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -322,6 +515,15 @@ public class EntityCreationService {
}
/**
* Identifies a text entity located immediately after a specified string within a semantic node.
*
* @param string The string to search for. The text immediately following this string is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects found immediately after the specified string.
*/
public Stream<TextEntity> lineAfterString(String string, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -335,6 +537,15 @@ public class EntityCreationService {
}
/**
* Identifies a text entity located immediately after a specified string within a semantic node, case-insensitive.
*
* @param string The string to search for, case-insensitive. The text immediately following this string is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param node The semantic node within which the search is performed.
* @return A stream of {@link TextEntity} objects found immediately after the specified string, case-insensitively.
*/
public Stream<TextEntity> lineAfterStringIgnoreCase(String string, String type, EntityType entityType, SemanticNode node) {
TextBlock textBlock = node.getTextBlock();
@ -348,6 +559,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities located immediately after a specified string across table cell columns within a table node.
*
* @param string The string to search for. The text immediately following this string in subsequent table cells is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param tableNode The table node within which the search is performed.
* @return A stream of {@link TextEntity} objects found across table cell columns immediately after the specified string.
*/
public Stream<TextEntity> lineAfterStringAcrossColumns(String string, String type, EntityType entityType, Table tableNode) {
return tableNode.streamTableCells()
@ -359,6 +579,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities located immediately after a specified string across table cell columns within a table node, case-insensitive.
*
* @param string The string to search for, case-insensitive. The text immediately following this string in subsequent table cells is considered for entity creation.
* @param type The type of the entity to be created.
* @param entityType The detailed classification of the entity.
* @param tableNode The table node within which the search is performed.
* @return A stream of {@link TextEntity} objects found across table cell columns immediately after the specified string, case-insensitively.
*/
public Stream<TextEntity> lineAfterStringAcrossColumnsIgnoreCase(String string, String type, EntityType entityType, Table tableNode) {
return tableNode.streamTableCells()
@ -397,6 +626,15 @@ public class EntityCreationService {
}
/**
* Attempts to create a text entity for text within a semantic node, immediately after a specified string.
*
* @param semanticNode The semantic node within which to search for the string.
* @param string The string after which the entity should be created.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @return An {@link Optional} containing the created {@link TextEntity}, or {@link Optional#empty()} if the string is not found.
*/
public Optional<TextEntity> semanticNodeAfterString(SemanticNode semanticNode, String string, String type, EntityType entityType) {
var textBlock = semanticNode.getTextBlock();
@ -415,30 +653,77 @@ public class EntityCreationService {
}
/**
 * Finds text entities by matching a regular expression against a semantic node's text block,
 * considering line breaks in the text.
 * Delegates to the overload that takes a group index, passing group {@code 0}.
 *
 * @param regexPattern The regex pattern to match.
 * @param type         The type of entity to create.
 * @param entityType   The entity's classification.
 * @param node         The semantic node containing the text block to search.
 * @return A stream of identified {@link TextEntity} objects.
 */
public Stream<TextEntity> byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, SemanticNode node) {
    final int defaultGroup = 0;
    return byRegexWithLineBreaks(regexPattern, type, entityType, defaultGroup, node);
}
/**
 * Finds text entities by matching a regular expression against a semantic node's text block,
 * considering line breaks in the text, case-insensitive.
 * Delegates to the overload that takes a group index, passing group {@code 0}.
 *
 * @param regexPattern The regex pattern to match.
 * @param type         The type of entity to create.
 * @param entityType   The entity's classification.
 * @param node         The semantic node containing the text block to search.
 * @return A stream of identified {@link TextEntity} objects.
 */
public Stream<TextEntity> byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) {
    final int defaultGroup = 0;
    return byRegexWithLineBreaksIgnoreCase(regexPattern, type, entityType, defaultGroup, node);
}
/**
 * Finds text entities by matching a regular expression against a semantic node's text block.
 * Delegates to the overload that takes a group index, passing group {@code 0}.
 *
 * @param regexPattern The regex pattern to match.
 * @param type         The type of entity to create.
 * @param entityType   The entity's classification.
 * @param node         The semantic node containing the text block to search.
 * @return A stream of identified {@link TextEntity} objects.
 */
public Stream<TextEntity> byRegex(String regexPattern, String type, EntityType entityType, SemanticNode node) {
    final int defaultGroup = 0;
    return byRegex(regexPattern, type, entityType, defaultGroup, node);
}
/**
 * Finds text entities by matching a regular expression against a semantic node's text block, case-insensitive.
 * Delegates to the overload that takes a group index, passing group {@code 0}.
 *
 * @param regexPattern The regex pattern to match.
 * @param type         The type of entity to create.
 * @param entityType   The entity's classification.
 * @param node         The semantic node containing the text block to search.
 * @return A stream of identified {@link TextEntity} objects.
 */
public Stream<TextEntity> byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) {
    final int defaultGroup = 0;
    return byRegexIgnoreCase(regexPattern, type, entityType, defaultGroup, node);
}
/**
* Identifies text entities within a semantic node's text block based on a regex pattern that includes line breaks.
*
* @param regexPattern Regex pattern to match, including handling for line breaks.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param group The regex group to target for entity creation.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the regex pattern.
*/
public Stream<TextEntity> byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByRegexWithLineBreaks(regexPattern, group, node.getTextBlock())
@ -449,6 +734,16 @@ public class EntityCreationService {
}
/**
* Identifies text entities within a semantic node's text block based on a regex pattern that includes line breaks, case-insensitive.
*
* @param regexPattern Regex pattern to match, including handling for line breaks.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param group The regex group to target for entity creation.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the regex pattern.
*/
public Stream<TextEntity> byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByRegexWithLineBreaksIgnoreCase(regexPattern, group, node.getTextBlock())
@ -459,6 +754,16 @@ public class EntityCreationService {
}
/**
* Identifies text entities based on a simple regex pattern.
*
* @param regexPattern Regex pattern to match.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param group The regex group to target for entity creation.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the regex pattern.
*/
public Stream<TextEntity> byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByRegex(regexPattern, group, node.getTextBlock())
@ -469,6 +774,16 @@ public class EntityCreationService {
}
/**
* Identifies text entities based on a simple regex pattern, case-insensitive.
*
* @param regexPattern Regex pattern to match.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param group The regex group to target for entity creation.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the regex pattern.
*/
public Stream<TextEntity> byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByRegexIgnoreCase(regexPattern, group, node.getTextBlock())
@ -479,6 +794,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities based on an exact string match within a semantic node's text block.
*
* @param keyword String keyword to search for.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the exact string.
*/
public Stream<TextEntity> byString(String keyword, String type, EntityType entityType, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByString(keyword, node.getTextBlock())
@ -489,6 +813,15 @@ public class EntityCreationService {
}
/**
* Identifies text entities based on an exact string match within a semantic node's text block, case-insensitive.
*
* @param keyword String keyword to search for.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects that match the exact string, case-insensitive.
*/
public Stream<TextEntity> byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node) {
return RedactionSearchUtility.findTextRangesByStringIgnoreCase(keyword, node.getTextBlock())
@ -499,6 +832,14 @@ public class EntityCreationService {
}
/**
* Extracts text entities from paragraphs only, within a given semantic node.
*
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param node The semantic node to search within.
* @return A stream of {@link TextEntity} objects extracted from paragraphs only.
*/
public Stream<TextEntity> bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType) {
return node.streamAllSubNodesOfType(NodeType.PARAGRAPH)
@ -508,6 +849,14 @@ public class EntityCreationService {
}
/**
* Merges consecutive paragraphs into a single text entity within a given semantic node.
*
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param node The semantic node to search within.
* @return A stream of merged {@link TextEntity} objects from consecutive paragraphs.
*/
public Stream<TextEntity> bySemanticNodeParagraphsOnlyMergeConsecutive(SemanticNode node, String type, EntityType entityType) {
return node.streamAllSubNodesOfType(NodeType.PARAGRAPH)
@ -520,6 +869,15 @@ public class EntityCreationService {
}
/**
* Creates a text entity immediately following a specified string within a semantic node.
*
* @param string The string after which to create the entity.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @param node The semantic node to search within.
* @return An {@link Optional} containing the created {@link TextEntity}, or {@link Optional#empty()} if not found.
*/
public Optional<TextEntity> semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node) {
if (!node.containsString(string)) {
@ -530,6 +888,14 @@ public class EntityCreationService {
}
/**
* Creates a text entity based on the entire text range of a semantic node.
*
* @param node The semantic node to base the text entity on.
* @param type The type of entity to create.
* @param entityType The entity's classification.
* @return An {@link Optional} containing the created {@link TextEntity}, or {@link Optional#empty()} if not valid.
*/
public Optional<TextEntity> bySemanticNode(SemanticNode node, String type, EntityType entityType) {
TextRange textRange = node.getTextBlock().getTextRange();
@ -544,6 +910,13 @@ public class EntityCreationService {
}
/**
* Expands a text entity's start boundary based on a regex pattern match.
*
* @param entity The original text entity to expand.
* @param regexPattern The regex pattern used to find the new start boundary.
* @return An {@link Optional} containing the expanded {@link TextEntity}, or {@link Optional#empty()} if not valid.
*/
public Optional<TextEntity> byPrefixExpansionRegex(TextEntity entity, String regexPattern) {
int expandedStart = RedactionSearchUtility.getExpandedStartByRegex(entity, regexPattern);
@ -551,6 +924,13 @@ public class EntityCreationService {
}
/**
* Expands a text entity's end boundary based on a regex pattern match.
*
* @param entity The original text entity to expand.
* @param regexPattern The regex pattern used to find the new end boundary.
* @return An {@link Optional} containing the expanded {@link TextEntity}, or {@link Optional#empty()} if not valid.
*/
public Optional<TextEntity> bySuffixExpansionRegex(TextEntity entity, String regexPattern) {
int expandedEnd = RedactionSearchUtility.getExpandedEndByRegex(entity, regexPattern);
@ -594,7 +974,7 @@ public class EntityCreationService {
throw new IllegalArgumentException(String.format("%s is not in the %s of the provided semantic node %s", textRange, node.getTextRange(), node));
}
TextRange trimmedTextRange = textRange.trim(node.getTextBlock());
if (trimmedTextRange.length() == 0){
if (trimmedTextRange.length() == 0) {
return Optional.empty();
}
TextEntity entity = TextEntity.initialEntityNode(trimmedTextRange, type, entityType, node);
@ -646,6 +1026,16 @@ public class EntityCreationService {
}
/**
* Merges a list of text entities into a single entity, assuming they intersect and are of the same type.
*
* @param entitiesToMerge The list of entities to merge.
* @param type The type for the merged entity.
* @param entityType The entity's classification.
* @param node The semantic node related to these entities.
* @return A single merged {@link TextEntity}.
* @throws IllegalArgumentException If entities do not intersect or have different types.
*/
public TextEntity mergeEntitiesOfSameType(List<TextEntity> entitiesToMerge, String type, EntityType entityType, SemanticNode node) {
if (!allEntitiesIntersectAndHaveSameTypes(entitiesToMerge)) {
@ -683,11 +1073,22 @@ public class EntityCreationService {
addEntityToGraph(mergedEntity, node);
insertToKieSession(mergedEntity);
entitiesToMerge.stream().filter(e -> !e.equals(mergedEntity)).forEach(node.getEntities()::remove);
entitiesToMerge.stream()
.filter(e -> !e.equals(mergedEntity))
.forEach(node.getEntities()::remove);
return mergedEntity;
}
/**
* Copies a list of text entities, creating a new entity for each in the list with the same properties.
*
* @param entities The list of entities to copy.
* @param type The type for the copied entities.
* @param entityType The classification for the copied entities.
* @param node The semantic node related to these entities.
* @return A stream of copied {@link TextEntity} objects.
*/
public Stream<TextEntity> copyEntities(List<TextEntity> entities, String type, EntityType entityType, SemanticNode node) {
return entities.stream()
@ -695,6 +1096,15 @@ public class EntityCreationService {
}
/**
* Copies a single text entity, preserving all its matched rules.
*
* @param entity The entity to copy.
* @param type The type for the copied entity.
* @param entityType The classification for the copied entity.
* @param node The semantic node related to the entity.
* @return A copied {@link TextEntity} with matched rules.
*/
public TextEntity copyEntity(TextEntity entity, String type, EntityType entityType, SemanticNode node) {
var newEntity = copyEntityWithoutRules(entity, type, entityType, node);
@ -703,6 +1113,15 @@ public class EntityCreationService {
}
/**
* Copies a single text entity without its matched rules.
*
* @param entity The entity to copy.
* @param type The type for the copied entity.
* @param entityType The classification for the copied entity.
* @param node The semantic node related to the entity.
* @return A copied {@link TextEntity} without matched rules.
*/
public TextEntity copyEntityWithoutRules(TextEntity entity, String type, EntityType entityType, SemanticNode node) {
TextEntity newEntity = byTextRangeWithEngine(entity.getTextRange(), type, entityType, node, entity.getEngines()).orElseThrow(() -> new NotFoundException(
@ -714,14 +1133,27 @@ public class EntityCreationService {
}
public void insertToKieSession(TextEntity mergedEntity) {
/**
* Inserts a text entity into the kieSession for further processing.
*
* @param textEntity The text entity to insert.
*/
public void insertToKieSession(TextEntity textEntity) {
if (kieSession != null) {
kieSession.insert(mergedEntity);
kieSession.insert(textEntity);
}
}
/**
* Creates a text entity based on a Named Entity Recognition (NER) entity.
*
* @param nerEntity The NER entity used for creating the text entity.
* @param entityType The entity's classification.
* @param semanticNode The semantic node related to the NER entity.
* @return A new {@link TextEntity} based on the NER entity.
*/
public TextEntity byNerEntity(NerEntities.NerEntity nerEntity, EntityType entityType, SemanticNode semanticNode) {
return byTextRangeWithEngine(nerEntity.textRange(), nerEntity.type(), entityType, semanticNode, Set.of(Engine.NER)).orElseThrow(() -> new NotFoundException(
@ -729,24 +1161,59 @@ public class EntityCreationService {
}
/**
 * Builds a {@link TextEntity} from the text range of a NER entity, using the supplied type
 * rather than the NER entity's own type. The entity is created with {@link Engine#NER} as its only engine.
 *
 * @param nerEntity    The NER entity whose text range is used.
 * @param type         Type to assign to the created entity.
 * @param entityType   The entity's classification.
 * @param semanticNode The semantic node the entity is created in.
 * @return The created {@link TextEntity}.
 * @throws NotFoundException If no entity could be created for the NER entity's text range.
 */
public TextEntity byNerEntity(NerEntities.NerEntity nerEntity, String type, EntityType entityType, SemanticNode semanticNode) {
    Optional<TextEntity> created = byTextRangeWithEngine(nerEntity.textRange(), type, entityType, semanticNode, Set.of(Engine.NER));
    if (created.isEmpty()) {
        throw new NotFoundException("No entity present!");
    }
    return created.get();
}
/**
 * Tries to build a {@link TextEntity} from a NER entity, taking both the text range and the type
 * from the NER entity itself. The entity is created with {@link Engine#NER} as its only engine.
 *
 * @param nerEntity    The NER entity providing text range and type.
 * @param entityType   The entity's classification.
 * @param semanticNode The semantic node the entity is created in.
 * @return An {@link Optional} holding the created {@link TextEntity}, or {@link Optional#empty()} if none was created.
 */
public Optional<TextEntity> optionalByNerEntity(NerEntities.NerEntity nerEntity, EntityType entityType, SemanticNode semanticNode) {
    TextRange nerRange = nerEntity.textRange();
    String nerType = nerEntity.type();
    return byTextRangeWithEngine(nerRange, nerType, entityType, semanticNode, Set.of(Engine.NER));
}
/**
 * Tries to build a {@link TextEntity} from the text range of a NER entity, using the supplied type
 * rather than the NER entity's own type. The entity is created with {@link Engine#NER} as its only engine.
 *
 * @param nerEntity    The NER entity whose text range is used.
 * @param type         Type to assign to the created entity.
 * @param entityType   The entity's classification.
 * @param semanticNode The semantic node the entity is created in.
 * @return An {@link Optional} holding the created {@link TextEntity}, or {@link Optional#empty()} if none was created.
 */
public Optional<TextEntity> optionalByNerEntity(NerEntities.NerEntity nerEntity, String type, EntityType entityType, SemanticNode semanticNode) {
    TextRange nerRange = nerEntity.textRange();
    return byTextRangeWithEngine(nerRange, type, entityType, semanticNode, Set.of(Engine.NER));
}
/**
* Combines the given NER entities via {@link NerEntitiesAdapter#combineNerEntitiesToCbiAddressDefaults} and returns the resulting text entities.
*
* @param nerEntities The collection of NER entities to combine.
* @param type The type for the combined entity.
* @param entityType The classification for the combined entity.
* @param semanticNode The semantic node related to these entities.
* @return A stream of combined {@link TextEntity} objects.
*/
public Stream<TextEntity> combineNerEntitiesToCbiAddressDefaults(NerEntities nerEntities, String type, EntityType entityType, SemanticNode semanticNode) {
return NerEntitiesAdapter.combineNerEntitiesToCbiAddressDefaults(nerEntities)
@ -756,12 +1223,25 @@ public class EntityCreationService {
}
/**
 * Decides whether a text range inside a text block may be used as an entity.
 * A range qualifies when it has a positive length and passes the
 * {@code boundaryIsSurroundedBySeparators} check for the given text block.
 *
 * @param textBlock The text block containing the text range.
 * @param textRange The text range to validate.
 * @return true if the text range is non-empty and passes the separator check, false otherwise.
 */
public boolean isValidEntityTextRange(TextBlock textBlock, TextRange textRange) {
    // An empty range can never be an entity; skip the separator check entirely.
    if (textRange.length() <= 0) {
        return false;
    }
    return boundaryIsSurroundedBySeparators(textBlock, textRange);
}
/**
* Adds a text entity to its related semantic node and updates the document tree accordingly.
*
* @param entity The text entity to add.
* @param node The semantic node related to the entity.
*/
public void addEntityToGraph(TextEntity entity, SemanticNode node) {
DocumentTree documentTree = node.getDocumentTree();

View File

@ -18,6 +18,13 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class RedactionSearchUtility {
/**
* Checks if any part of a CharSequence matches a given regex pattern.
*
* @param charSequence The CharSequence to be searched.
* @param regexPattern The regex pattern to match against.
* @return true if any part of the CharSequence matches the regex pattern.
*/
public static boolean anyMatch(CharSequence charSequence, String regexPattern) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, false);
@ -25,6 +32,13 @@ public class RedactionSearchUtility {
}
/**
* Checks if any part of a CharSequence matches a given regex pattern, case-insensitive.
*
* @param charSequence The CharSequence to be searched.
* @param regexPattern The regex pattern to match against.
* @return true if any part of the CharSequence matches the regex pattern.
*/
public static boolean anyMatchIgnoreCase(CharSequence charSequence, String regexPattern) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
@ -32,24 +46,53 @@ public class RedactionSearchUtility {
}
/**
 * Tests whether a CharSequence, taken as a whole, matches a regex pattern.
 * Unlike the {@code anyMatch} variants, a match of only part of the input does not count.
 *
 * @param charSequence The CharSequence to be matched.
 * @param regexPattern The regex pattern the whole input must match.
 * @return true if the entire CharSequence matches the regex pattern, false otherwise.
 */
public static boolean exactMatch(CharSequence charSequence, String regexPattern) {
    // Materialise the input first so a null input fails before the pattern is compiled.
    String text = charSequence.toString();
    return Pattern.matches(regexPattern, text);
}
/**
 * Checks, ignoring case, whether the search text of a TextBlock contains a match for a regex pattern.
 * Delegates to the overload that takes the search text directly.
 *
 * @param textBlock    The TextBlock whose search text is examined.
 * @param regexPattern The regex pattern to match against.
 * @return true if any part of the TextBlock's search text matches the regex pattern.
 */
public static boolean anyMatchIgnoreCase(TextBlock textBlock, String regexPattern) {
    var searchText = textBlock.getSearchText();
    return anyMatchIgnoreCase(searchText, regexPattern);
}
/**
 * Checks whether the search text of a TextBlock contains a match for a regex pattern.
 * Delegates to the overload that takes the search text directly.
 *
 * @param textBlock    The TextBlock whose search text is examined.
 * @param regexPattern The regex pattern to match against.
 * @return true if any part of the TextBlock's search text matches the regex pattern.
 */
public static boolean anyMatch(TextBlock textBlock, String regexPattern) {
    var searchText = textBlock.getSearchText();
    return anyMatch(searchText, regexPattern);
}
/**
* Finds the first TextRange in a given CharSequence that matches a regex pattern.
*
* @param regexPattern The regex pattern to match against.
* @param searchText The CharSequence to be searched.
* @return The first TextRange that matches the pattern.
* @throws IllegalArgumentException If no match is found.
*/
public static TextRange findFirstTextRange(String regexPattern, CharSequence searchText) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, false);
@ -61,6 +104,13 @@ public class RedactionSearchUtility {
}
/**
* Expands the end boundary of a TextEntity based on a subsequent regex match.
*
* @param entity The entity to expand.
* @param regexPattern The regex pattern used for expansion.
* @return The new end boundary index.
*/
public static int getExpandedEndByRegex(TextEntity entity, String regexPattern) {
int expandedEnd;
@ -74,6 +124,13 @@ public class RedactionSearchUtility {
}
/**
* Expands the start boundary of a TextEntity based on a regex match.
*
* @param entity The entity to expand.
* @param regexPattern The regex pattern used for expansion.
* @return The new start boundary index.
*/
public static int getExpandedStartByRegex(TextEntity entity, String regexPattern) {
int expandedStart;
@ -87,6 +144,14 @@ public class RedactionSearchUtility {
}
/**
* Identifies all lines within a text block that fall within a specified vertical range.
*
* @param maxY The maximum Y-coordinate of the vertical range.
* @param minY The minimum Y-coordinate of the vertical range.
* @param textBlock The text block containing the lines to be checked.
* @return A {@link TextRange} encompassing all lines within the specified Y-coordinate range.
*/
public static TextRange findTextRangesOfAllLinesInYRange(double maxY, double minY, TextBlock textBlock) {
List<TextRange> lineBoundaries = IntStream.range(0, textBlock.numberOfLines()).boxed()
@ -107,6 +172,13 @@ public class RedactionSearchUtility {
}
/**
* Finds TextRanges matching a regex pattern within a TextBlock.
*
* @param regexPattern The regex pattern to match against.
* @param textBlock The TextBlock to search within.
* @return A list of TextRanges corresponding to regex matches.
*/
public static List<TextRange> findTextRangesByRegex(String regexPattern, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, false);
@ -115,6 +187,14 @@ public class RedactionSearchUtility {
}
/**
* Finds the TextRanges of a specific capturing group for each match of a regex pattern within a TextBlock.
*
* @param regexPattern The regex pattern to match against.
* @param group The group to capture within the regex pattern.
* @param textBlock The TextBlock to search within.
* @return A list of TextRanges corresponding to regex matches.
*/
public static List<TextRange> findTextRangesByRegex(String regexPattern, int group, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, false);
@ -122,6 +202,14 @@ public class RedactionSearchUtility {
}
/**
* Finds text ranges that match a regex pattern with consideration for line breaks within a text block.
*
* @param regexPattern The regex pattern to search for, allowing for multiline matches.
* @param group The regex pattern group to extract from matches.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects corresponding to the matches found.
*/
public static List<TextRange> findTextRangesByRegexWithLineBreaks(String regexPattern, int group, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, false);
@ -129,6 +217,13 @@ public class RedactionSearchUtility {
}
/**
* Finds text ranges that match a regex pattern with consideration for line breaks within a text block, case-insensitive.
*
* @param regexPattern The regex pattern to search for, allowing for multiline matches, with case-insensitive matching.
* @param group The regex pattern group to extract from matches.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects corresponding to the matches found.
*/
public static List<TextRange> findTextRangesByRegexWithLineBreaksIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, true);
@ -136,6 +231,13 @@ public class RedactionSearchUtility {
}
/**
* Finds text ranges within a text block that match a given regex pattern, case-insensitive.
*
* @param regexPattern The regex pattern to search for, with case-insensitive matching.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects corresponding to the matches found.
*/
public static List<TextRange> findTextRangesByRegexIgnoreCase(String regexPattern, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
@ -143,6 +245,14 @@ public class RedactionSearchUtility {
}
/**
* Finds text ranges within a text block that match a given regex pattern, capturing a specific group, and case-insensitive.
*
* @param regexPattern The regex pattern to search for.
* @param group The group within the regex pattern to capture.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects corresponding to the group matches found, with case-insensitive matching.
*/
public static List<TextRange> findTextRangesByRegexIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
@ -173,6 +283,13 @@ public class RedactionSearchUtility {
}
/**
* Finds all occurrences of a specified string within a text block and returns their positions as text ranges.
*
* @param searchString The string to search for within the text block.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects representing the start and end positions of each occurrence of the search string.
*/
public static List<TextRange> findTextRangesByString(String searchString, TextBlock textBlock) {
List<TextRange> boundaries = new LinkedList<>();
@ -183,6 +300,13 @@ public class RedactionSearchUtility {
}
/**
* Finds all occurrences of a specified string within a text block, case-insensitive, and returns their positions as text ranges.
*
* @param searchString The string to search for within the text block, case-insensitively.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects representing the start and end positions of each occurrence of the search string, case-insensitive.
*/
public static List<TextRange> findTextRangesByStringIgnoreCase(String searchString, TextBlock textBlock) {
Pattern pattern = Pattern.compile(Pattern.quote(searchString), Pattern.CASE_INSENSITIVE);
@ -190,6 +314,13 @@ public class RedactionSearchUtility {
}
/**
* Searches a text block for all occurrences of each string in a list and returns their positions as text ranges.
*
* @param searchList A list of strings to search for within the text block.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects representing the start and end positions of occurrences of each string in the list.
*/
public static List<TextRange> findTextRangesByList(List<String> searchList, TextBlock textBlock) {
List<TextRange> boundaries = new LinkedList<>();
@ -200,6 +331,13 @@ public class RedactionSearchUtility {
}
/**
* Searches a text block for all occurrences of each string in a list, case-insensitive, and returns their positions as text ranges.
*
* @param searchList A list of strings to search for within the text block, case-insensitively.
* @param textBlock The text block to search within.
* @return A list of {@link TextRange} objects representing the start and end positions of occurrences of each string in the list, case-insensitive.
*/
public static List<TextRange> findTextRangesByListIgnoreCase(List<String> searchList, TextBlock textBlock) {
List<TextRange> boundaries = new LinkedList<>();

View File

@ -12,14 +12,12 @@ import java.util.Collection;
import java.util.stream.Stream;
import java.util.Optional;
import com.iqser.red.service.redaction.v1.server.model.document.*;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.*;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule
import com.iqser.red.service.redaction.v1.server.model.document.nodes.*;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
@ -31,14 +29,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SectionIdentifier;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.*;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConcatenatedTextBlock;
import com.iqser.red.service.redaction.v1.server.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;

View File

@ -12,14 +12,12 @@ import java.util.Collection;
import java.util.stream.Stream;
import java.util.Optional;
import com.iqser.red.service.redaction.v1.server.model.document.*;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.*;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule
import com.iqser.red.service.redaction.v1.server.model.document.nodes.*;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
@ -31,14 +29,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SectionIdentifier;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.*;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConcatenatedTextBlock;
import com.iqser.red.service.redaction.v1.server.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;

View File

@ -12,14 +12,12 @@ import java.util.Collection;
import java.util.stream.Stream;
import java.util.Optional;
import com.iqser.red.service.redaction.v1.server.model.document.*;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.*;
import com.iqser.red.service.redaction.v1.server.model.document.entity.IEntity;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule
import com.iqser.red.service.redaction.v1.server.model.document.nodes.*;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
@ -31,14 +29,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SectionIdentifier;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.*;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConcatenatedTextBlock;
import com.iqser.red.service.redaction.v1.server.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;