RED-9159 - Improve date conversion #404

Merged
andrei.isvoran.ext merged 1 commit from RED-9159 into master 2024-05-17 14:29:35 +02:00
6 changed files with 150 additions and 9 deletions

View File

@ -65,6 +65,8 @@ dependencies {
implementation("org.reflections:reflections:0.10.2")
implementation("com.joestelmach:natty:0.13")
testImplementation(project(":rules-management"))
testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")

View File

@ -11,7 +11,9 @@ import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
@ -27,6 +29,7 @@ import lombok.extern.slf4j.Slf4j;
public class DateConverter {
private static DateTimeFormatter DATE_TIME_FORMATTER;
private static final List<Locale> LOCALES = Arrays.asList(Locale.UK, Locale.US);
public Optional<Date> parseDate(String dateAsString) {
@ -34,15 +37,29 @@ public class DateConverter {
DateTimeFormatter formatter = getDateTimeFormatter();
String cleanDate = dateAsString.trim();
cleanDate = removeTrailingDot(cleanDate);
try {
LocalDate localDate = LocalDate.parse(cleanDate, formatter);
Date date = Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant());
return Optional.of(date);
} catch (DateTimeParseException e) {
log.warn("Failed to parse date: {}", cleanDate);
return Optional.empty();
for (Locale locale : LOCALES) {
try {
return convertToDate(locale, cleanDate, formatter);
} catch (DateTimeParseException e) {
try {
Optional<String> extractedDate = DateExtractorNatty.extractDate(cleanDate);
if (extractedDate.isEmpty()) {
log.warn("Failed to extract a valid date from value: {}", cleanDate);
return Optional.empty();
} else {
cleanDate = extractedDate.get();
return convertToDate(locale, cleanDate, formatter);
}
} catch (DateTimeParseException exception) {
log.debug("Failed to parse date: {} with locale: {}", cleanDate, locale);
}
}
}
log.warn("Failed to parse date: {}", cleanDate);
return Optional.empty();
}
@ -53,6 +70,14 @@ public class DateConverter {
}
/**
 * Parses {@code cleanDate} with {@code formatter} switched to the given locale and converts the
 * result to a {@link Date} at the start of that day in the system default time zone.
 *
 * @param locale    locale applied to the formatter for this parse attempt
 * @param cleanDate the date text to parse
 * @param formatter the base formatter; it is not modified, a locale-specific copy is used
 * @return an Optional holding the converted date; parse failures surface as an exception instead
 * @throws java.time.format.DateTimeParseException if the text cannot be parsed by the formatter
 */
private Optional<Date> convertToDate(Locale locale, String cleanDate, DateTimeFormatter formatter) {

    DateTimeFormatter localizedFormatter = formatter.withLocale(locale);
    return Optional.of(Date.from(LocalDate.parse(cleanDate, localizedFormatter)
            .atStartOfDay(ZoneId.systemDefault())
            .toInstant()));
}
private DateTimeFormatter getDateTimeFormatter() {
if (DATE_TIME_FORMATTER == null) {
@ -65,13 +90,17 @@ public class DateConverter {
private DateTimeFormatter createFormatterFromResource() {
DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder();
builder.parseCaseInsensitive();
try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(DateConverter.class.getResourceAsStream("/date_formats.txt"))))) {
String line;
while ((line = reader.readLine()) != null) {
builder.appendOptional(DateTimeFormatter.ofPattern(line.trim(), Locale.UK));
String pattern = line.trim();
if (!pattern.isEmpty()) {
builder.appendOptional(DateTimeFormatter.ofPattern(pattern, Locale.UK));
}
}
} catch (IOException e) {
throw new RuntimeException("Error reading date format file: " + e.getMessage());
throw new RuntimeException("Error reading date format file: " + e.getMessage(), e);
}
return builder.toFormatter().withResolverStyle(ResolverStyle.SMART).withLocale(Locale.UK);
}

View File

@ -0,0 +1,26 @@
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.List;
import java.util.Optional;
import com.joestelmach.natty.DateGroup;
import com.joestelmach.natty.Parser;
import lombok.experimental.UtilityClass;
/**
 * Helper that uses the Natty parser to pick the date portion out of a longer piece of text,
 * e.g. {@code "13 November 2017"} out of {@code "13 November 2017 (animal 1)"}.
 */
@UtilityClass
public class DateExtractorNatty {

    /**
     * Returns the text of the first date group Natty reports for the given input.
     *
     * @param text the text handed to the Natty parser
     * @return the text of the first date group, or an empty Optional when no group was found
     */
    public Optional<String> extractDate(String text) {

        List<DateGroup> detectedGroups = new Parser().parse(text);
        if (detectedGroups.isEmpty()) {
            return Optional.empty();
        }

        // Only the first group is considered; any further groups are ignored.
        return Optional.of(detectedGroups.get(0).getText());
    }

}

View File

@ -1,6 +1,7 @@
dd-MMM-yyyy
dd MMM yyyy
dd MMM yy
d MMM yy
dd MM yyyy
dd MMMM yyyy
MMMM dd, yyyy
@ -34,5 +35,6 @@ dd.MMM.yy
dd.MMMM.yy
dd.MMM-yyyy
dd.MMMM-yyyy
dd. MMM yy
d['th']['st']['nd']['rd'] MMMM yy
d['th']['st']['nd']['rd'] MMMM yyyy

View File

@ -0,0 +1,54 @@
package com.iqser.red.service.redaction.v1.server.date;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import org.junit.jupiter.api.Test;
import com.iqser.red.service.redaction.v1.server.utils.DateConverter;
/**
 * Checks that {@link DateConverter#parseDate} yields a value for every sample in a fixed list of
 * date strings. Only presence of a result is asserted, not the resulting date value.
 */
public class DateConverterTest {

    /** Sample inputs: short and long date spellings, some followed by a parenthesised remark. */
    private static final List<String> GOLDEN_STANDARD_DATES = Arrays.asList(
            "3 Jun 08",
            "09. Apr 09",
            "07-Sep-2010",
            "26-FEB-2008",
            "30-APR-2008",
            "30-apr-2008",
            "30-Apr-2008",
            "1 Apr 08",
            "26-FEB-2008",
            "19-MAR-2008",
            "1 Apr 08",
            "27-MAR-2008",
            "06-MAY-2008",
            "3 Apr 08",
            "12-MAR-2008",
            "08-APR-2008",
            "1 Apr 08",
            "4 Apr 08",
            "13 November 2017 (animal 1)",
            "16 November 2017 (animal 1)",
            "27 March 2018 (animal 1 - 5000 mg/kg bw)",
            "10 April 2018 (animal 1 - 5000 mg/kg bw)",
            "13 November 2017 (animal 1)",
            "16 November 2017 (animal 1)",
            "28 March 2018 (animal 1 - 5000 mg/kg bw)",
            "28 March 2018 (animal1 - 5000 mg/kg bw)",
            "28 August 2018 (animal 1)",
            "31 August 2018 (animal 1)");


    @Test
    public void testDateConverter() {

        // Samples are checked in list order; the first one that fails aborts the test.
        GOLDEN_STANDARD_DATES.forEach(DateConverterTest::assertParses);
    }


    /** Asserts that the converter returns a non-empty result for the given raw date text. */
    private static void assertParses(String rawDate) {

        Optional<Date> converted = DateConverter.parseDate(rawDate);
        assertTrue(converted.isPresent(), "Failed to parse date: " + rawDate);
    }

}

View File

@ -0,0 +1,28 @@
package com.iqser.red.service.redaction.v1.server.date;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Optional;
import org.junit.jupiter.api.Test;
import com.iqser.red.service.redaction.v1.server.utils.DateExtractorNatty;
/**
 * Checks that {@link DateExtractorNatty#extractDate} returns just the date portion of inputs that
 * carry a trailing parenthesised remark.
 */
public class DateExtractorNattyTest {

    @Test
    public void testExtractDate() {

        // Each row is { input text, expected extracted date }.
        String[][] cases = {
                {"13 November 2017 (animal 1)", "13 November 2017"},
                {"16 November 2017 (animal 1)", "16 November 2017"},
                {"27 March 2018 (animal 1 - 5000 mg/kg bw)", "27 March 2018"},
                {"10 April 2018 (animal 1 - 5000 mg/kg bw)", "10 April 2018"},
                {"13 November 2017 (animal 1)", "13 November 2017"},
                {"16 November 2017 (animal 1)", "16 November 2017"},
                {"28 March 2018 (animal 1 - 5000 mg/kg bw)", "28 March 2018"},
                {"28 March 2018 (animal1 - 5000 mg/kg bw)", "28 March 2018"},
                {"28 August 2018 (animal 1)", "28 August 2018"},
                {"31 August 2018 (animal 1)", "31 August 2018"}
        };

        for (String[] testCase : cases) {
            String input = testCase[0];
            String expected = testCase[1];

            Optional<String> extractedDate = DateExtractorNatty.extractDate(input);

            assertTrue(extractedDate.isPresent(), "No date found in: " + input);
            assertEquals(expected, extractedDate.get(), "Failed to extract correct date from: " + input);
        }
    }

}