From d5769ced154aa8627dfc09499b5ff1a04d66231f Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Wed, 17 Apr 2024 15:26:13 +0300 Subject: [PATCH] RED-8650 - Support more date formats --- .../v1/server/utils/DateConverter.java | 77 +++++++++++++----- .../src/main/resources/date_formats.txt | 38 +++++++++ .../v1/server/DocumineFloraTest.java | 32 ++++++++ .../resources/files/dates/date_formats.pdf | Bin 0 -> 3481 bytes 4 files changed, 126 insertions(+), 21 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java index ccf8d832..f7857be0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/utils/DateConverter.java @@ -1,10 +1,19 @@ package com.iqser.red.service.redaction.v1.server.utils; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; import java.text.DateFormat; import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.format.ResolverStyle; import java.util.Date; -import java.util.List; import java.util.Locale; +import java.util.Objects; import java.util.Optional; import lombok.AccessLevel; @@ -17,39 +26,65 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class DateConverter { - static List formats = List.of(new SimpleDateFormat("dd MMM yy", Locale.ENGLISH), - new SimpleDateFormat("dd MM yyyy", Locale.ENGLISH), - new SimpleDateFormat("dd MM yyyy.", Locale.ENGLISH), - new SimpleDateFormat("dd MMMM yyyy", Locale.ENGLISH), - new SimpleDateFormat("MMMM dd, yyyy", Locale.ENGLISH), - new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)); + private static DateTimeFormatter DATE_TIME_FORMATTER; public Optional parseDate(String dateAsString) { - Date date = null; - for (SimpleDateFormat format : formats) { - - try { - date = format.parse(dateAsString); - break; - } catch (Exception e) { - log.warn("Failed to parse date from string {}. \n{}", dateAsString, e.getMessage()); - // ignore, try next... - } - } - if (date == null) { + DateTimeFormatter formatter = getDateTimeFormatter(); + String cleanDate = dateAsString.trim(); + cleanDate = removeTrailingDot(cleanDate); + try { + LocalDate localDate = LocalDate.parse(cleanDate, formatter); + Date date = Date.from(localDate.atStartOfDay(ZoneId.systemDefault()).toInstant()); + return Optional.of(date); + } catch (DateTimeParseException e) { + log.warn("Failed to parse date: {}", cleanDate); return Optional.empty(); } - return Optional.of(date); + } public String convertDate(Date date, String resultFormat) { DateFormat resultDateFormat = new SimpleDateFormat(resultFormat, Locale.ENGLISH); - return resultDateFormat.format(date); } + + private DateTimeFormatter getDateTimeFormatter() { + + if (DATE_TIME_FORMATTER == null) { + DATE_TIME_FORMATTER = createFormatterFromResource(); + } + return DATE_TIME_FORMATTER; + } + + + private DateTimeFormatter createFormatterFromResource() { + + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(DateConverter.class.getResourceAsStream("/date_formats.txt"))))) { + String line; + while ((line = reader.readLine()) != null) { + builder.appendOptional(DateTimeFormatter.ofPattern(line.trim(), Locale.ENGLISH)); + } + } catch (IOException e) { + throw new RuntimeException("Error reading date format file: " + e.getMessage()); + } + return builder.toFormatter().withResolverStyle(ResolverStyle.SMART).withLocale(Locale.ENGLISH); + } + + + private String removeTrailingDot(String dateAsString) { + + String str = dateAsString; + if (str != null && !str.isEmpty() && str.charAt(str.length() - 1) == '.') { + str = str.substring(0, str.length() - 1); + } + + return str; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt new file mode 100644 index 00000000..b4fb4351 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/resources/date_formats.txt @@ -0,0 +1,38 @@ +dd MMM yyyy +dd MMM yy +dd MM yyyy +dd MMMM yyyy +MMMM dd, yyyy +dd-MMM-yyyy +dd.MM.yyyy +yyyy/MM/dd +yyyy-MM-dd +dd-MM-yyyy +MMMM d, yyyy +d MMMM yyyy +MMM d, yyyy +d['.'] MMM yyyy +d-MMM-yyyy +d['th']['st']['nd']['rd'] 'of' MMMM, yyyy +MMMM d['th']['st']['nd']['rd'], yyyy +yyyy, MMMM d +yyyy.MM.dd +yyyyMMdd +dd-MM-yy +dd/MM/yy +MMMM d, yy +d MMMM, yy +d['th']['st']['nd']['rd'] MMM yyyy +MMM d['th']['st']['nd']['rd'], yy +yyyy-MMM-dd +MMM-dd-yyyy +dd-MM-yyyy +yyyy, MMMM dd +dd.MMM.yyyy +dd.MMMM.yyyy +dd.MMM.yy +dd.MMMM.yy +dd.MMM-yyyy +dd.MMMM-yyyy +d['th']['st']['nd']['rd'] MMMM yy +d['th']['st']['nd']['rd'] MMMM yyyy \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index cdafda65..c452b809 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -1,5 +1,7 @@ package com.iqser.red.service.redaction.v1.server; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.when; import java.io.FileOutputStream; @@ -106,6 +108,36 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { } + @Test + public void testConvertingVariousDateFormats() { + + AnalyzeRequest request = uploadFileToStorage("files/dates/date_formats.pdf"); + System.out.println("Start Full integration test"); + analyzeDocumentStructure(LayoutParsingType.DOCUMINE, request); + System.out.println("Finished structure analysis"); + analyzeService.analyze(request); + System.out.println("Finished analysis"); + + var componentLog = redactionStorageService.getComponentLog(TEST_DOSSIER_ID, TEST_FILE_ID); + assertNotNull(componentLog); + + var experimentalDates = componentLog.getComponentLogEntries().stream().filter(c -> c.getName().equals("Experimental_Starting_Date")).findFirst().get(); + assertNotNull(experimentalDates); + + String dates = experimentalDates.getComponentValues().get(0).getValue(); + String[] dateArray = dates.split(", "); + boolean allEqual = true; + for (String date : dateArray) { + if (!"10/01/2022".equals(date)) { + allEqual = false; + break; + } + } + + assertTrue(allEqual); + } + + @Test // @Disabled public void testTopOfPage13InNotHeader() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/dates/date_formats.pdf new file mode 100644 index 0000000000000000000000000000000000000000..553b18be292b739ad8cdb5bed272fe0f55d5d38c GIT binary patch literal 3481 zcmY!laBl4!6vllGGvveYc#%l2n(}WT16i`aY?7=_MHoh6W%_#U(|liMhO76?3$oFflN^ zW2oT;83YlA8EHv5li4&?4DEq^iZ+X0Oig+Vv;rkvRl zVv$_q=9iMP&Y)xaI<9G)yDB6m9m>ws(pP_P>U!SeOY41AE}`&EihlD#KX84OUHJIv zBJFeQ3s^6l@P1{mk4uW%sU`GH?&=x*ZXehAefizu<=7b;x~ta7b;85*@kalCo;H8d z*xZFBOB53620uJ8sIC$YsfTGHD@55UcDQwj+HaA_bV(7kbhswqkE{dufRjzs+yx?&C zK*04A2TphdE2?lOHD+^&Gkgh=zJ|@!=0v;Nf!E$55H;YP$) zEtV99_5mM!q0P0z0hmVCn`~SWbRL?x+d%=IS}gk(o4bjxz9>v3LMxQF7E^dcUPU95*C*uhc=|G<)4%lT$0PB z@26mB%%$&^SyWu2U||Ac;ci74gNzA6O-56LajD#|7|dPlZVy*{veM>ty8xt zFKKkVTr(k8^>TquvbOi<_v(>)lC66iKPum~{eGugkAX`uLBfe;hSCv%1SU=wMkY3a z)|rJ)XCx9*6xezc`FPD6EtaaWOwwRHqhLCv!D*IiL9OpH_rJcS4?en?K5w@AW;e6E zDSPFtIoXVH{{%JLOj10IQ*`z%`m*<@+brJL&al#7^Taot5iij1?ki9K!La;t<|W>J zv&1I{$<`F9pHsgYYFlAf8E|zrk6^M1*T=3!r#HL_Db;E96!dwa(weSsbmCFEe9fl{ zoqgRw&E+}2!lo3dyFQ(~^~%-_X1V;=E_~c3DmsC6eRW}Ra`imz$LwzJmaI$>o_%oL zwv^gD)(NWjyYFoj3%YV`zv|CZXKqgLuw(775&Ozt9R2=GKL5YfDT`ef^V)9pInidn zm~RGiiGT%%tP5WfdndE{p*dVuq9s}#>+Z1!dfEl+Z`kDv=4WxX%oZrymz`*rg3Ws!*fEPDgF-CNX`f6K@Z) zoOhekU$I+u(cB{jrX|TWk17+zKP@`{CMoJV)7w10Ud_yZfv5 z9j;mS>ec&4Kiogg@A#pA0ZS?bryo!o%hcEs*4#Em&QqQ)3eh$O&W;8a2Il6DmQE&) zE-uEdPUgmjPDU1PMsAjlu9jwY3WSx!a_M{KrR6Jt8lS-KMNoczi2}$XAUim-DpkQ4 znC2n*AR;I=4Wu|FwL&4<04M>}8ml&lZLTXhoFj8AG8a2a)E7 zdql-9%w4!>!@Un#s$7S#1aWam WVo^y&5ztS@hDN}in5wJ48y5f*F%