diff --git a/drools-prompt/EntityCreationService_doc b/drools-prompt/EntityCreationService_doc new file mode 100644 index 00000000..1b2d7667 --- /dev/null +++ b/drools-prompt/EntityCreationService_doc @@ -0,0 +1,124 @@ +/** +* Searches the provided SemanticNode for the keyword and creates an Entity for each occurrence. +* @param keyword the string to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType +*/ +public Stream byString(String keyword, String type, EntityType entityType, SemanticNode node) +/** +* Same as byString, but case insensitive. +*/ +public Stream byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node) +/** +* Searches the provided SemanticNode with the regexPattern and creates a new RedactionEntity with the provided group for each occurrence. +* @param regexPattern The regexPattern +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param group the regexPattern group, that should be the entity +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType +*/ +public Stream byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegex, but case insensitive. +*/ +public Stream byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegex, but can handle patterns with linebreaks. 
+*/ +public Stream byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegexWithLineBreaks, but case insensitive. +*/ +public Stream byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Finds the provided string, and creates a new RedactionEntity from the text following it, up to the end of the line it is found in. +* @param string The keyword to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the text after the keyword as value, the type as type and the provided EntityType +*/ +public Stream lineAfterString(String string, String type, EntityType entityType, SemanticNode node) +/** +* Same as lineAfterString, but with multiple keywords +*/ +public Stream lineAfterStrings(List strings, String type, EntityType entityType, SemanticNode node) +/** +* Finds the provided string in a TableCell, and creates a new RedactionEntity in the same line but adjacent table cells to the right. +* @param string The keyword to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param table The TableNode to search in +* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType +*/ +public Stream lineAfterStringAcrossColumns(String string, String type, EntityType entityType, TableNode table) +/** +* Creates a redaction entity based on the given boundary, type, entity type, and semantic node. +* +* @param boundary The boundary of the redaction entity. +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The semantic node where the boundary is. 
+* @return An Optional containing the new redaction entity. +*/ +public Optional byBoundary(Boundary boundary, String type, EntityType entityType, SemanticNode node) +/** +* Creates new RedactionEntities between the provided start and stop boundaries. The start and stop boundaries are excluded. +* If any boundaries of the new RedactionEntities overlap, only the shortest boundary will be used. +* @param startBoundaries List of start boundaries +* @param stopBoundaries List of stop boundaries +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The semantic node where the boundaries are. +* @return A Stream of new RedactionEntities between the start and stop boundaries +*/ +public Stream betweenBoundaries(List startBoundaries, List stopBoundaries, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a text search on the provided SemanticNode. +*/ +public Stream betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenStrings, but case insensitive. +*/ +public Stream betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* These 6 functions work the same as betweenStrings, but they also include the start and/or stop strings or are case insensitive, depending on their name. 
+*/ +public Stream betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a regex search on the provided SemanticNode. +*/ +public Stream betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenRegexes, but case insensitive. +*/ +public Stream betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) +/** +* Creates a new RedactionEntity which has the same boundary as the provided SemanticNode. +* @param node The SemanticNode to create a new RedactionEntity from. +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @return An Optional RedactionEntity, which is empty if the provided SemanticNode is empty. +*/ +public Optional bySemanticNode(SemanticNode node, String type, EntityType entityType) +/** +* Same as bySemanticNode, but skips the SemanticNode if it is not a Paragraph, and likewise skips all of its child SemanticNodes that are not Paragraphs. 
+*/ +public Stream bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType) +/** +* Searches the provided SemanticNode for the provided string, and creates a new RedactionEntity, from the end of the first occurrence of the string until the end of the SemanticNode. +* @param string The string to search for +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The SemanticNode to use and search in +* @return An Optional RedactionEntity, which is empty if the SemanticNode is empty or the string isn't found in the SemanticNode. +*/ +public Optional semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node) \ No newline at end of file diff --git a/drools-prompt/RedactionEntity_doc b/drools-prompt/RedactionEntity_doc new file mode 100644 index 00000000..b87a269f --- /dev/null +++ b/drools-prompt/RedactionEntity_doc @@ -0,0 +1,33 @@ +/** +* Sets the Entity to applied. This is the default. +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void apply(String ruleIdentifier, String reason, String legalBasis) +/** +* Same as apply, but legalBasis can be null. +*/ +void force(String ruleIdentifier, String reason, String legalBasis) +/** +* Sets the Entity to not applied. +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void skip(String ruleIdentifier, String reason, String legalBasis) +/** +* Sets the Entity to ignored. In most cases, prefer this over remove. 
+* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void ignore(String ruleIdentifier, String reason, String legalBasis) +/** +* Removes the entity entirely, also removes it from all EntitySets and sets intersectingNodes and deepestFullyContainingNode to null. +* Should only be used in a few cases! +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void remove(String ruleIdentifier, String reason, String legalBasis) \ No newline at end of file diff --git a/drools-prompt/RedactionEntity_properties_doc b/drools-prompt/RedactionEntity_properties_doc new file mode 100644 index 00000000..2371f709 --- /dev/null +++ b/drools-prompt/RedactionEntity_properties_doc @@ -0,0 +1,39 @@ +/** +* A boundary has a start and end property accessible by start() and end() respectively. It marks the start and end offset in String offsets. +*/ +Boundary boundary +/** +* The type of an Entity identifies groups of Entities. Examples of Entity types we want to find are "CBI_author" for authors, "CBI_address" for addresses or "PII" for personally identifiable information. +* Other types include helper types, which interact with the main Entities, for example "published_information" or "vertebrate". These Entities are used to modify the main Entities, if they occur in the same Section. +* A typical example would be to ignore Entities of type "CBI_author", if they occur in the same Section as "published_information" entities. +*/ +String type +/** +* The EntityType can be one of four different values: ENTITY, RECOMMENDATION, FALSE_POSITIVE, FALSE_RECOMMENDATION. 
+* If an ENTITY is overlapped by a FALSE_POSITIVE, the ENTITY is removed. If a RECOMMENDATION is overlapped by either an ENTITY or FALSE_RECOMMENDATION, it is removed. +*/ +EntityType entityType +/** +* The text the Entity represents. +*/ +String value +/** +* Up to three words after the Entity in the text. +*/ +String textAfter +/** +* Up to three words before the Entity in the text. +*/ +String textBefore +/** +* All pages whose TextBlock intersects the boundary of this entity. Is always equal to the Pages which have this RedactionEntity in their EntitySet. +*/ +Set pages +/** +* All SemanticNodes whose TextBlock intersects the boundary of this entity. Is always equal to the SemanticNodes which have this RedactionEntity in their EntitySet. +*/ +List intersectingNodes +/** +* The SemanticNode which is the deepest in the Tree structure and whose TextBlock fully contains the boundary of this Node. +*/ +SemanticNode deepestFullyContainingNode \ No newline at end of file diff --git a/drools-prompt/Section_doc b/drools-prompt/Section_doc index 0c2a86af..257f0b87 100644 --- a/drools-prompt/Section_doc +++ b/drools-prompt/Section_doc @@ -1,6 +1,6 @@ - /** - * Determines whether this Section has any tables. - * - * @return {@code true} if there are tables, {@code false} otherwise - */ - public boolean hasTables() \ No newline at end of file +/** +* Determines whether this Section has any tables. +* +* @return {@code true} if there are tables, {@code false} otherwise +*/ +public boolean hasTables() \ No newline at end of file diff --git a/drools-prompt/SemanticNode_doc b/drools-prompt/SemanticNode_doc index c7410f23..a3bc4759 100644 --- a/drools-prompt/SemanticNode_doc +++ b/drools-prompt/SemanticNode_doc @@ -1,11 +1,9 @@ - /** * Returns the type of this node, such as NodeType.SECTION, NodeType.PARAGRAPH, etc. * * @return NodeType of this node */ NodeType getType(); - /** * Any Node maintains its own Set of Entities. 
* This Set contains all Entities whose boundary intersects the boundary of this node. @@ -14,14 +12,12 @@ NodeType getType(); * @return Set of all Entities associated with this Node */ Set getEntities(); - /** * Returns all Pages this SemanticNode is associated with. * * @return Set of Pages this node appears on. */ Set getPages() - /** * Checks if this node appears on the specified page number. * @@ -29,20 +25,17 @@ Set getPages() * @return True if this node is found on the specified page number, false otherwise. */ boolean isOnPage(int pageNumber) - /** * Returns the closest Headline associated with this SemanticNode * * @return First Headline found. */ Headline getHeadline() - /** * @return The SemanticNode representing the Parent in the DocumentTree * throws NotFoundException, when no parent is present */ SemanticNode getParent() - /** * Checks whether this SemanticNode has any Entity of the provided type. * Ignores Entity with ignored == true or removed == true. @@ -51,7 +44,6 @@ SemanticNode getParent() * @return true, if this SemanticNode has at least one Entity of the provided type */ boolean hasEntitiesOfType(String type) - /** * Checks whether this SemanticNode has any Entity of the provided types. * Ignores Entity with ignored == true or removed == true. @@ -60,7 +52,6 @@ boolean hasEntitiesOfType(String type) * @return true, if this SemanticNode has at least one Entity of any of the provided types */ boolean hasEntitiesOfAnyType(String... types) - /** * Checks whether this SemanticNode has at least one Entity of each of the provided types. * Ignores Entity with ignored == true or removed == true. @@ -69,7 +60,6 @@ boolean hasEntitiesOfAnyType(String... types) * @return true, if this SemanticNode has at least one Entity of each of the provided types */ boolean hasEntitiesOfAllTypes(String... types) - /** * Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author". 
* Ignores Entity with ignored == true or removed == true. @@ -78,7 +68,6 @@ boolean hasEntitiesOfAllTypes(String... types) * @return List of RedactionEntities of any the type */ List getEntitiesOfType(String type) - /** * Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author". * Ignores Entity with ignored == true or removed == true. @@ -87,7 +76,6 @@ List getEntitiesOfType(String type) * @return List of RedactionEntities of any provided type */ List getEntitiesOfType(List types) - /** * Returns a List of Entities in this SemanticNode which have any of the provided types. * Ignores Entity with the ignored flag set to true or the removed flag set to true. @@ -96,7 +84,6 @@ List getEntitiesOfType(List types) * @return List of RedactionEntities that match any of the provided types */ List getEntitiesOfType(String... types) - /** * Checks whether this SemanticNode contains the provided String. * @@ -104,7 +91,6 @@ List getEntitiesOfType(String... types) * @return true, if this node's TextBlock contains the string */ boolean containsString(String string) - /** * Checks whether this SemanticNode contains all the provided Strings. * @@ -112,7 +98,6 @@ boolean containsString(String string) * @return true, if this node's TextBlock contains all strings */ boolean containsAllStrings(String... strings) - /** * Checks whether this SemanticNode contains any of the provided Strings. * @@ -127,7 +112,6 @@ boolean containsAnyString(String... strings) * @return true, if this node's TextBlock contains the string ignoring case */ boolean containsStringIgnoreCase(String string) - /** * Checks whether this SemanticNode contains any of the provided Strings ignoring case. * @@ -135,7 +119,6 @@ boolean containsStringIgnoreCase(String string) * @return true, if this node's TextBlock contains any of the strings */ boolean containsAnyStringIgnoreCase(String... 
strings) - /** * Checks whether this SemanticNode contains any of the provided Strings ignoring case. * @@ -143,7 +126,6 @@ boolean containsAnyStringIgnoreCase(String... strings) * @return true, if this node's TextBlock contains any of the strings */ boolean containsAllStringsIgnoreCase(String... strings) - /** * Checks whether this SemanticNode matches the provided regex pattern. * @@ -151,7 +133,6 @@ boolean containsAllStringsIgnoreCase(String... strings) * @return true, if this node's TextBlock contains the regex pattern */ boolean matchesRegex(String regexPattern) - /** * Checks whether this SemanticNode matches the provided regex pattern ignoring case. * @@ -159,14 +140,12 @@ boolean matchesRegex(String regexPattern) * @return true, if this node's TextBlock contains the regex pattern ignoring case */ boolean matchesRegexIgnoreCase(String regexPattern) - /** * Streams all children located directly underneath this node in the DocumentTree. * * @return Stream of all children */ Stream streamChildren() - /** * Streams all children located directly underneath this node in the DocumentTree of the provided type. * @@ -174,14 +153,12 @@ Stream streamChildren() * @return Stream of all children of the provided type */ Stream streamChildrenOfType(NodeType nodeType) - /** * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order. * * @return Stream of all SubNodes */ Stream streamAllSubNodes() - /** * Recursively streams all SemanticNodes of a specified type located underneath this node in the DocumentTree in order. * @@ -189,14 +166,12 @@ Stream streamAllSubNodes() * @return a Stream of all SubNodes of the specified type */ Stream streamAllSubNodesOfType(NodeType nodeType) - /** * The Boundary is the start and end string offsets in the reading order of the document. 
* * @return Boundary of this Node's TextBlock */ Boundary getBoundary() - /** * The SectionIdentifier uses the numeric identifiers of Headlines to infer a tree structure. * It implements functions such as sectionIdentifier.isChildOf(otherSectionIdentifier) and sectionIdentifier.isParentOf(otherSectionIdentifier) diff --git a/drools-prompt/Table_doc b/drools-prompt/Table_doc index c553d04f..486577af 100644 --- a/drools-prompt/Table_doc +++ b/drools-prompt/Table_doc @@ -5,7 +5,6 @@ * @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings */ Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) - /** * Checks whether the specified row contains all the provided strings. * @@ -14,7 +13,6 @@ Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) - /** * Streams all entities which appear in a row where at least one cell has the provided header and the provided value. * @@ -23,7 +21,6 @@ boolean rowContainsStringsIgnoreCase(Integer row, List strings) * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value. */ Stream streamEntitiesWhereRowHasHeaderAndValue(String header, String value) - /** * Streams all entities which appear in a row where at least one cell has the provided header and any provided value. * @@ -32,7 +29,6 @@ Stream streamEntitiesWhereRowHasHeaderAndValue(String header, S * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value. */ Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List values) - /** * Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types. * Ignores Entity with ignored == true or removed == true. 
@@ -41,7 +37,6 @@ Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header * @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types. */ Stream streamEntitiesWhereRowContainsEntitiesOfType(List types) - /** * Streams all entities in this table, that appear in a row, which contains no entity of any of the provided types. * Ignores Entity with ignored == true or removed == true. @@ -50,14 +45,12 @@ Stream streamEntitiesWhereRowContainsEntitiesOfType(List streamEntitiesWhereRowContainsNoEntitiesOfType(List types) - /** * Streams all TableCells in this Table which have the provided header row-wise. * * @return Stream of all TableCells which have the provided header */ Stream streamTableCellsWithHeader(String header) - /** * Streams all Headers and checks if any equal the provided string. * @@ -65,7 +58,6 @@ Stream streamTableCellsWithHeader(String header) * @return true, if at least one header equals the provided string */ boolean hasHeader(String header) - /** * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value. * @@ -74,7 +66,6 @@ boolean hasHeader(String header) * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value */ boolean hasRowWithHeaderAndValue(String header, String value) - /** * Finds all entities of the provided type, which appear in the same row that the provided entity appears in. * Ignores Entity with ignored == true or removed == true. 
diff --git a/drools-prompt/drools-prompt b/drools-prompt/drools-prompt index 79f9831e..426b041a 100644 --- a/drools-prompt/drools-prompt +++ b/drools-prompt/drools-prompt @@ -18,6 +18,14 @@ The Section, Table, TableCell, Paragraph, and Headline implement a common interf - Sections may have any child except TableCells, but if it contains Paragraphs as well as Tables, it is split into a Section with multiple Sections as children, where any child Section only contains either Tables or Paragraphs. Further, if the first SemanticNode is a Headline it remains the first child in the Parent Section, before any subsections. +The goal of the software is to find relevant pieces of text. Each piece of text is represented by a RedactionEntity. +The most important pieces of relevant text are those we want to redact, for example all authors of a dossier, or all personally identifiable information such as e-mail addresses and telephone numbers. +RedactionEntities may also represent other pieces of text, such as published information or certain species of vertebrates. +The RedactionEntities are part of the document structure: each SemanticNode and Page that contains a RedactionEntity references it, and the RedactionEntity in turn references each Page and SemanticNode it occurs in. +So the same RedactionEntity is referenced by the Paragraph it is located in, as well as by all of that Paragraph's parent Sections. + +Prior to the execution of the rules, the Document structure is assembled and a text search is performed to create initial Entities of different types. They are then inserted into the document structure. +Then the KieSession is created and each SemanticNode and Entity is inserted into its working memory. 
---------------------------------------------------------------- The relevant functions for SemanticNode: /** @@ -229,7 +237,7 @@ Boundary getBoundary() SectionIdentifier getSectionIdentifier() ---------------------------------------------------------------- -TheTable has the additional functions: +The Table has the additional functions: /** * Streams all entities in this table, that appear in a row, which contains any of the provided strings. * @@ -347,7 +355,213 @@ Set getEntities(); */ Integer getPageNumber(); ---------------------------------------------------------------- -The goal of the Rules is to find pieces of Text that we want to redact. These pieces of text are represented as RedactionEntities + +---------------------------------------------------------------- +The RedactionEntity has the following properties: +/** +* A boundary has a start and end property accessible by start() and end() respectively. It marks the start and end offset in String offsets. +*/ +Boundary boundary +/** +* The type of an Entity identifies groups of Entities. Examples of Entity types we want to find are "CBI_author" for authors, "CBI_address" for addresses or "PII" for personally identifiable information. +* Other types include helper types, which interact with the main Entities, for example "published_information" or "vertebrate". These Entities are used to modify the main Entities, if they occur in the same Section. +* A typical example would be to ignore Entities of type "CBI_author", if they occur in the same Section as "published_information" entities. +*/ +String type +/** +* The EntityType can be one of four different values: ENTITY, RECOMMENDATION, FALSE_POSITIVE, FALSE_RECOMMENDATION. +* If an ENTITY is overlapped by a FALSE_POSITIVE, the ENTITY is removed. If a RECOMMENDATION is overlapped by either an ENTITY or FALSE_RECOMMENDATION, it is removed. +*/ +EntityType entityType +/** +* The text the Entity represents. 
+*/ +String value +/** +* Up to three words after the Entity in the text. +*/ +String textAfter +/** +* Up to three words before the Entity in the text. +*/ +String textBefore +/** +* All pages whose TextBlock intersects the boundary of this entity. Is always equal to the Pages which have this RedactionEntity in their EntitySet. +*/ +Set pages +/** +* All SemanticNodes whose TextBlock intersects the boundary of this entity. Is always equal to the SemanticNodes which have this RedactionEntity in their EntitySet. +*/ +List intersectingNodes +/** +* The SemanticNode which is the deepest in the Tree structure and whose TextBlock fully contains the boundary of this entity. +*/ +SemanticNode deepestFullyContainingNode + +The RedactionEntity also has the following methods: +/** +* Sets the Entity to applied. This is the default. +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void apply(String ruleIdentifier, String reason, String legalBasis) +/** +* Same as apply, but legalBasis can be null. +*/ +void force(String ruleIdentifier, String reason, String legalBasis) +/** +* Sets the Entity to not applied. +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void skip(String ruleIdentifier, String reason, String legalBasis) +/** +* Sets the Entity to ignored. In most cases, prefer this over remove. 
+* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void ignore(String ruleIdentifier, String reason, String legalBasis) +/** +* Removes the entity entirely, also removes it from all EntitySets and sets intersectingNodes and deepestFullyContainingNode to null. +* Should only be used in a few cases! +* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name +* @param reason Should describe the intention of the rule in a few words +* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null +*/ +void remove(String ruleIdentifier, String reason, String legalBasis) +---------------------------------------------------------------- +The EntityCreationService offers the following functions: +/** +* Searches the provided SemanticNode for the keyword and creates an Entity for each occurrence. +* @param keyword the string to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType +*/ +public Stream byString(String keyword, String type, EntityType entityType, SemanticNode node) +/** +* Same as byString, but case insensitive. +*/ +public Stream byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node) +/** +* Searches the provided SemanticNode with the regexPattern and creates a new RedactionEntity with the provided group for each occurrence. 
+* @param regexPattern The regexPattern +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param group the regexPattern group, that should be the entity +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the matched group as value, the type as type and the provided EntityType +*/ +public Stream byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegex, but case insensitive. +*/ +public Stream byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegex, but can handle patterns with linebreaks. +*/ +public Stream byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Same as byRegexWithLineBreaks, but case insensitive. +*/ +public Stream byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) +/** +* Finds the provided string, and creates a new RedactionEntity from the text following it, up to the end of the line it is found in. +* @param string The keyword to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param node The SemanticNode to search in +* @return A Stream of RedactionEntities with the text after the keyword as value, the type as type and the provided EntityType +*/ +public Stream lineAfterString(String string, String type, EntityType entityType, SemanticNode node) +/** +* Same as lineAfterString, but with multiple keywords +*/ +public Stream lineAfterStrings(List strings, String type, EntityType entityType, SemanticNode node) +/** +* Finds the provided string in a TableCell, and creates a new RedactionEntity in the same line but adjacent table cells to the right. 
+* @param string The keyword to search for +* @param type The type of the RedactionEntity to be created +* @param entityType The EntityType of the RedactionEntity to be created +* @param table The TableNode to search in +* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType +*/ +public Stream lineAfterStringAcrossColumns(String string, String type, EntityType entityType, TableNode table) +/** +* Creates a redaction entity based on the given boundary, type, entity type, and semantic node. +* +* @param boundary The boundary of the redaction entity. +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The semantic node where the boundary is. +* @return An Optional containing the new redaction entity. +*/ +public Optional byBoundary(Boundary boundary, String type, EntityType entityType, SemanticNode node) +/** +* Creates new RedactionEntities between the provided start and stop boundaries. The start and stop boundaries are excluded. +* If any boundaries of the new RedactionEntities overlap, only the shortest boundary will be used. +* @param startBoundaries List of start boundaries +* @param stopBoundaries List of stop boundaries +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The semantic node where the boundaries are. +* @return A Stream of new RedactionEntities between the start and stop boundaries +*/ +public Stream betweenBoundaries(List startBoundaries, List stopBoundaries, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a text search on the provided SemanticNode. +*/ +public Stream betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenStrings, but case insensitive. 
+*/ +public Stream betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* These 6 functions work the same as betweenStrings, but they also include the start and/or stop strings and/or are case insensitive, depending on their name. +*/ +public Stream betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) +public Stream betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a regex search on the provided SemanticNode. +*/ +public Stream betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) +/** +* Same as betweenRegexes, but case insensitive. +*/ +public Stream betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) +/** +* Creates a new RedactionEntity which has the same boundary as the provided SemanticNode. +* @param node The SemanticNode to create a new RedactionEntity from. +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @return An Optional RedactionEntity, which is empty if the provided SemanticNode is empty. 
+*/ +public Optional bySemanticNode(SemanticNode node, String type, EntityType entityType) +/** +* Same as bySemanticNode, but ignores the SemanticNode if it is not a Paragraph, as well as all of its child SemanticNodes that are not Paragraphs. +*/ +public Stream bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType) +/** +* Searches the provided SemanticNode for the provided string, and creates a new RedactionEntity from the end of the first occurrence of the string until the end of the SemanticNode. +* @param string The string to search for +* @param type The type of the redaction entity. +* @param entityType The entity type of the redaction entity. +* @param node The SemanticNode to use and search in +* @return An Optional RedactionEntity, which is empty if the SemanticNode is empty or the string isn't found in the SemanticNode. +*/ +public Optional semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node) +---------------------------------------------------------------- +Rules may be grouped into two categories. +The first category changes existing RedactionEntities, and the second creates new RedactionEntities. + There are two different types of rules: in one you create new Entities, and in the other you change or remove existing Entities. An Entity is any piece of text, uniquely identified in the Document by its Boundary, its Type and its EntityType. The Boundary consists of a start and stop index in the text of the document. The Type is a String like "PII", which stands for