diff --git a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts index bc2f9899..88b92e6c 100644 --- a/redaction-service-v1/redaction-service-server-v1/build.gradle.kts +++ b/redaction-service-v1/redaction-service-server-v1/build.gradle.kts @@ -65,6 +65,8 @@ dependencies { implementation("org.reflections:reflections:0.10.2") + implementation("com.joestelmach:natty:0.13") + testImplementation(project(":rules-management")) testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java index d9d70147..8d7435a7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java @@ -11,7 +11,9 @@ import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; import java.time.format.ResolverStyle; +import java.util.Arrays; import java.util.Date; +import java.util.List; import java.util.Locale; import java.util.Objects; import java.util.Optional; @@ -27,6 +29,7 @@ import lombok.extern.slf4j.Slf4j; public class DateConverter { private static DateTimeFormatter DATE_TIME_FORMATTER; + private static final List LOCALES = Arrays.asList(Locale.UK, Locale.US); public Optional parseDate(String dateAsString) { @@ -34,15 +37,29 @@ public class DateConverter { DateTimeFormatter formatter = getDateTimeFormatter(); String cleanDate = dateAsString.trim(); cleanDate = removeTrailingDot(cleanDate); - try { - LocalDate localDate = LocalDate.parse(cleanDate, formatter); - Date date = Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant()); - return Optional.of(date); - } catch (DateTimeParseException e) { - log.warn("Failed to parse date: {}", cleanDate); - return Optional.empty(); + + for (Locale locale : LOCALES) { + try { + return convertToDate(locale, cleanDate, formatter); + } catch (DateTimeParseException e) { + try { + Optional extractedDate = DateExtractorNatty.extractDate(cleanDate); + if (extractedDate.isEmpty()) { + log.warn("Failed to extract a valid date from value: {}", cleanDate); + return Optional.empty(); + } else { + cleanDate = extractedDate.get(); + return convertToDate(locale, cleanDate, formatter); + } + } catch (DateTimeParseException exception) { + log.debug("Failed to parse date: {} with locale: {}", cleanDate, locale); + } + } } + log.warn("Failed to parse date: {}", cleanDate); + return Optional.empty(); + } @@ -53,6 +70,14 @@ public class DateConverter { } + private Optional convertToDate(Locale locale, String cleanDate, DateTimeFormatter formatter) { + + LocalDate localDate = LocalDate.parse(cleanDate, formatter.withLocale(locale)); + Date date = Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant()); + return Optional.of(date); + } + + private DateTimeFormatter getDateTimeFormatter() { if (DATE_TIME_FORMATTER == null) { @@ -65,13 +90,17 @@ public class DateConverter { private DateTimeFormatter createFormatterFromResource() { DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + builder.parseCaseInsensitive(); try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(DateConverter.class.getResourceAsStream("/date_formats.txt"))))) { String line; while ((line = reader.readLine()) != null) { - builder.appendOptional(DateTimeFormatter.ofPattern(line.trim(), Locale.UK)); + String pattern = line.trim(); + if (!pattern.isEmpty()) { + builder.appendOptional(DateTimeFormatter.ofPattern(pattern, Locale.UK)); + } } } catch (IOException e) { - throw new RuntimeException("Error reading date format file: " + e.getMessage()); + throw new RuntimeException("Error reading date format file: " + e.getMessage(), e); } return builder.toFormatter().withResolverStyle(ResolverStyle.SMART).withLocale(Locale.UK); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateExtractorNatty.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateExtractorNatty.java new file mode 100644 index 00000000..efecbadf --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateExtractorNatty.java @@ -0,0 +1,26 @@ +package com.iqser.red.service.redaction.v1.server.utils; + +import java.util.List; +import java.util.Optional; + +import com.joestelmach.natty.DateGroup; +import com.joestelmach.natty.Parser; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class DateExtractorNatty { + + public Optional extractDate(String text) { + + Parser parser = new Parser(); + List groups = parser.parse(text); + if (!groups.isEmpty()) { + DateGroup group = groups.get(0); + String dateText = group.getText(); + return Optional.of(dateText); + } + return Optional.empty(); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt index c8725365..5ceea298 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt @@ -1,6 +1,7 @@ dd-MMM-yyyy dd MMM yyyy dd MMM yy +d MMM yy dd MM yyyy dd MMMM yyyy MMMM dd, yyyy @@ -34,5 +35,6 @@ dd.MMM.yy dd.MMMM.yy dd.MMM-yyyy dd.MMMM-yyyy +dd. MMM yy d['th']['st']['nd']['rd'] MMMM yy d['th']['st']['nd']['rd'] MMMM yyyy \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateConverterTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateConverterTest.java new file mode 100644 index 00000000..9ae99c38 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateConverterTest.java @@ -0,0 +1,54 @@ +package com.iqser.red.service.redaction.v1.server.date; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Optional; + +import org.junit.jupiter.api.Test; + +import com.iqser.red.service.redaction.v1.server.utils.DateConverter; + +public class DateConverterTest { + + @Test + public void testDateConverter() { + + List goldenStandardDates = Arrays.asList("3 Jun 08", + "09. Apr 09", + "07-Sep-2010", + "26-FEB-2008", + "30-APR-2008", + "30-apr-2008", + "30-Apr-2008", + "1 Apr 08", + "26-FEB-2008", + "19-MAR-2008", + "1 Apr 08", + "27-MAR-2008", + "06-MAY-2008", + "3 Apr 08", + "12-MAR-2008", + "08-APR-2008", + "1 Apr 08", + "4 Apr 08", + "13 November 2017 (animal 1)", + "16 November 2017 (animal 1)", + "27 March 2018 (animal 1 - 5000 mg/kg bw)", + "10 April 2018 (animal 1 - 5000 mg/kg bw)", + "13 November 2017 (animal 1)", + "16 November 2017 (animal 1)", + "28 March 2018 (animal 1 - 5000 mg/kg bw)", + "28 March 2018 (animal1 - 5000 mg/kg bw)", + "28 August 2018 (animal 1)", + "31 August 2018 (animal 1)"); + + for (String dateStr : goldenStandardDates) { + Optional parsedDate = DateConverter.parseDate(dateStr); + assertTrue(parsedDate.isPresent(), "Failed to parse date: " + dateStr); + } + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateExtractorNattyTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateExtractorNattyTest.java new file mode 100644 index 00000000..93dc19ed --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/date/DateExtractorNattyTest.java @@ -0,0 +1,28 @@ +package com.iqser.red.service.redaction.v1.server.date; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Optional; + +import org.junit.jupiter.api.Test; + +import com.iqser.red.service.redaction.v1.server.utils.DateExtractorNatty; + +public class DateExtractorNattyTest { + + @Test + public void testExtractDate() { + + String[] testStrings = {"13 November 2017 (animal 1)", "16 November 2017 (animal 1)", "27 March 2018 (animal 1 - 5000 mg/kg bw)", "10 April 2018 (animal 1 - 5000 mg/kg bw)", "13 November 2017 (animal 1)", "16 November 2017 (animal 1)", "28 March 2018 (animal 1 - 5000 mg/kg bw)", "28 March 2018 (animal1 - 5000 mg/kg bw)", "28 August 2018 (animal 1)", "31 August 2018 (animal 1)"}; + + String[] expectedDates = {"13 November 2017", "16 November 2017", "27 March 2018", "10 April 2018", "13 November 2017", "16 November 2017", "28 March 2018", "28 March 2018", "28 August 2018", "31 August 2018"}; + + for (int i = 0; i < testStrings.length; i++) { + Optional extractedDate = DateExtractorNatty.extractDate(testStrings[i]); + assertTrue(extractedDate.isPresent(), "No date found in: " + testStrings[i]); + assertEquals(expectedDates[i], extractedDate.get(), "Failed to extract correct date from: " + testStrings[i]); + } + } + +} \ No newline at end of file