1 package com.acumenvelocity.ath.common;
2
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.InputStream;
6
7 import org.apache.pdfbox.Loader;
8 import org.apache.pdfbox.pdmodel.PDDocument;
9 import org.apache.pdfbox.text.PDFTextStripper;
10
11 import com.acumenvelocity.ath.model.OcrMode;
12 import com.adobe.pdfservices.operation.PDFServices;
13 import com.adobe.pdfservices.operation.PDFServicesMediaType;
14 import com.adobe.pdfservices.operation.PDFServicesResponse;
15 import com.adobe.pdfservices.operation.auth.Credentials;
16 import com.adobe.pdfservices.operation.auth.ServicePrincipalCredentials;
17 import com.adobe.pdfservices.operation.io.Asset;
18 import com.adobe.pdfservices.operation.io.StreamAsset;
19 import com.adobe.pdfservices.operation.pdfjobs.jobs.CreatePDFJob;
20 import com.adobe.pdfservices.operation.pdfjobs.jobs.ExportPDFJob;
21 import com.adobe.pdfservices.operation.pdfjobs.params.createpdf.CreatePDFParams;
22 import com.adobe.pdfservices.operation.pdfjobs.params.createpdf.word.DocumentLanguage;
23 import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportOCRLocale;
24 import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportPDFParams;
25 import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportPDFTargetFormat;
26 import com.adobe.pdfservices.operation.pdfjobs.result.CreatePDFResult;
27 import com.adobe.pdfservices.operation.pdfjobs.result.ExportPDFResult;
28 import com.ibm.icu.util.ULocale;
29
30 import net.sf.okapi.common.LocaleId;
31 import net.sf.okapi.common.exceptions.OkapiIOException;
32
33 public final class PdfUtil {
34
35 private static PDFServices pdfServices;
36
37 private PdfUtil() {
38
39 }
40
41 public static void init() {
42 initializePdfServices();
43 }
44
45 private static void initializePdfServices() {
46 try {
47 String clientId = Const.ATH_PDF_CLIENT_ID;
48 String clientSecret = Const.ATH_PDF_CLIENT_SECRET;
49
50 if (clientId == null || clientSecret == null) {
51 throw new OkapiIOException("Adobe PDF Services credentials not configured. " +
52 "Please set ATH_PDF_CLIENT_ID and ATH_PDF_CLIENT_SECRET");
53 }
54
55 Credentials credentials = new ServicePrincipalCredentials(clientId, clientSecret);
56 pdfServices = new PDFServices(credentials);
57
58 } catch (Exception e) {
59 throw new OkapiIOException("Failed to initialize Adobe PDF Services", e);
60 }
61 }
62
63
64
65
66
67
68
69
70
71
72
73 public static boolean needsOcr(File pdfFile) {
74 if (pdfFile == null || !pdfFile.exists()) {
75 Log.warn(PdfUtil.class, "PDF file is null or missing: {}", pdfFile);
76 return true;
77 }
78
79 try (PDDocument document = Loader.loadPDF(pdfFile)) {
80 int numPages = document.getNumberOfPages();
81 if (numPages == 0) {
82 Log.debug(PdfUtil.class, "Empty PDF: {}", pdfFile.getName());
83 return true;
84 }
85
86 PDFTextStripper stripper = new PDFTextStripper();
87 stripper.setStartPage(1);
88 stripper.setEndPage(Math.min(5, numPages));
89
90 String text = stripper.getText(document);
91 if (text == null) {
92 Log.debug(PdfUtil.class, "No text extracted from '{}'; assuming OCR required.",
93 pdfFile.getName());
94 return true;
95 }
96
97 int visibleChars = text.replaceAll("\\s+", "").length();
98 boolean imageOnly = visibleChars < 10;
99
100 Log.info(PdfUtil.class, "PDF '{}' inspected: {} visible chars ({} pages) → needsOCR={}",
101 pdfFile.getName(), visibleChars, Math.min(5, numPages), imageOnly);
102
103 return imageOnly;
104
105 } catch (IOException e) {
106 Log.warn(PdfUtil.class, "Error reading PDF '{}': {}. Assuming OCR required.",
107 pdfFile.getName(),
108 e.getMessage());
109 return true;
110 }
111 }
112
113
114
115
116
117
118
119
120 public static ExportOCRLocale toAdobeLocale(LocaleId locale) {
121 if (locale == null) {
122 Log.warn(PdfUtil.class, "Warning: null locale provided, falling back to EN_US");
123 return ExportOCRLocale.EN_US;
124 }
125
126
127 String bcp47 = locale.toBCP47();
128 String bcp47Lower = bcp47.toLowerCase(ULocale.ROOT.toLocale());
129
130
131 try {
132
133 String enumName = bcp47.toUpperCase(ULocale.ROOT.toLocale()).replace('-', '_');
134 return ExportOCRLocale.valueOf(enumName);
135
136 } catch (IllegalArgumentException e) {
137
138 }
139
140
141 switch (bcp47Lower) {
142
143 case "en":
144 case "en-us":
145 return ExportOCRLocale.EN_US;
146
147 case "en-gb":
148 return ExportOCRLocale.EN_GB;
149
150
151 case "zh":
152 case "zh-cn":
153 case "zh-hans":
154 return ExportOCRLocale.ZH_CN;
155
156 case "zh-tw":
157 case "zh-hk":
158 case "zh-hant":
159 return ExportOCRLocale.ZH_HANT;
160
161
162 case "pt":
163 case "pt-pt":
164 return ExportOCRLocale.PT_PT;
165
166 case "pt-br":
167 return ExportOCRLocale.PT_BR;
168
169
170 case "nb":
171 case "no":
172 case "nb-no":
173 return ExportOCRLocale.NB_NO;
174
175 case "nn":
176 case "nn-no":
177 return ExportOCRLocale.NN_NO;
178
179
180 case "he":
181 case "he-il":
182 case "iw":
183 case "iw-il":
184 return ExportOCRLocale.IW_IL;
185
186
187 case "da":
188 return ExportOCRLocale.DA_DK;
189
190 case "lt":
191 return ExportOCRLocale.LT_LT;
192
193 case "sl":
194 return ExportOCRLocale.SL_SI;
195
196 case "el":
197 return ExportOCRLocale.EL_GR;
198
199 case "ru":
200 return ExportOCRLocale.RU_RU;
201
202 case "hu":
203 return ExportOCRLocale.HU_HU;
204
205 case "et":
206 return ExportOCRLocale.ET_EE;
207
208 case "uk":
209 return ExportOCRLocale.UK_UA;
210
211 case "pl":
212 return ExportOCRLocale.PL_PL;
213
214 case "lv":
215 return ExportOCRLocale.LV_LV;
216
217 case "fi":
218 return ExportOCRLocale.FI_FI;
219
220 case "ja":
221 return ExportOCRLocale.JA_JP;
222
223 case "es":
224 return ExportOCRLocale.ES_ES;
225
226 case "bg":
227 return ExportOCRLocale.BG_BG;
228
229 case "cs":
230 return ExportOCRLocale.CS_CZ;
231
232 case "mt":
233 return ExportOCRLocale.MT_MT;
234
235 case "de":
236 return ExportOCRLocale.DE_DE;
237
238 case "hr":
239 return ExportOCRLocale.HR_HR;
240
241 case "sk":
242 return ExportOCRLocale.SK_SK;
243
244 case "sr":
245 return ExportOCRLocale.SR_SR;
246
247 case "ca":
248 return ExportOCRLocale.CA_CA;
249
250 case "mk":
251 return ExportOCRLocale.MK_MK;
252
253 case "ko":
254 return ExportOCRLocale.KO_KR;
255
256 case "nl":
257 return ExportOCRLocale.NL_NL;
258
259 case "sv":
260 return ExportOCRLocale.SV_SE;
261
262 case "it":
263 return ExportOCRLocale.IT_IT;
264
265 case "tr":
266 return ExportOCRLocale.TR_TR;
267
268 case "fr":
269 return ExportOCRLocale.FR_FR;
270
271 case "ro":
272 return ExportOCRLocale.RO_RO;
273
274 case "eu":
275 return ExportOCRLocale.EU_ES;
276
277 case "gl":
278 return ExportOCRLocale.GL_ES;
279
280
281 case "de-ch":
282 return ExportOCRLocale.DE_CH;
283 }
284
285
286 String language = locale.getLanguage();
287
288 if (language != null) {
289 String languageLower = language.toLowerCase(ULocale.ROOT.toLocale());
290
291 switch (languageLower) {
292 case "en":
293 Log.warn(PdfUtil.class,
294 "Warning: locale '" + bcp47 + "' not directly supported, falling back to EN_US");
295
296 return ExportOCRLocale.EN_US;
297
298 case "de":
299 Log.warn(PdfUtil.class,
300 "Warning: locale '" + bcp47 + "' not directly supported, falling back to DE_DE");
301
302 return ExportOCRLocale.DE_DE;
303
304 case "fr":
305 Log.warn(PdfUtil.class,
306 "Warning: locale '" + bcp47 + "' not directly supported, falling back to FR_FR");
307
308 return ExportOCRLocale.FR_FR;
309
310 case "es":
311 Log.warn(PdfUtil.class,
312 "Warning: locale '" + bcp47 + "' not directly supported, falling back to ES_ES");
313
314 return ExportOCRLocale.ES_ES;
315
316 case "it":
317 Log.warn(PdfUtil.class,
318 "Warning: locale '" + bcp47 + "' not directly supported, falling back to IT_IT");
319
320 return ExportOCRLocale.IT_IT;
321
322 case "pt":
323 Log.warn(PdfUtil.class,
324 "Warning: locale '" + bcp47 + "' not directly supported, falling back to PT_PT");
325
326 return ExportOCRLocale.PT_PT;
327
328 case "zh":
329 Log.warn(PdfUtil.class,
330 "Warning: locale '" + bcp47 + "' not directly supported, falling back to ZH_CN");
331
332 return ExportOCRLocale.ZH_CN;
333
334 case "nb":
335 case "no":
336 case "nn":
337 Log.warn(PdfUtil.class,
338 "Warning: locale '" + bcp47 + "' not directly supported, falling back to NB_NO");
339
340 return ExportOCRLocale.NB_NO;
341 }
342 }
343
344
345 Log.warn(PdfUtil.class, "Warning: locale '" + bcp47
346 + "' not supported by Adobe PDF Services, falling back to EN_US");
347
348 return ExportOCRLocale.EN_US;
349 }
350
351
352
353
354
355
356
357
358 public static DocumentLanguage getDocumentLanguage(LocaleId locale) {
359 if (locale == null) {
360 Log.warn(PdfUtil.class, "Warning: null locale provided, falling back to EN_US");
361 return DocumentLanguage.EN_US;
362 }
363
364
365 String bcp47 = locale.toBCP47();
366 String bcp47Lower = bcp47.toLowerCase(ULocale.ROOT.toLocale());
367
368
369 try {
370
371 String enumName = bcp47.toUpperCase(ULocale.ROOT.toLocale()).replace('-', '_');
372 return DocumentLanguage.valueOf(enumName);
373
374 } catch (IllegalArgumentException e) {
375
376 }
377
378
379 switch (bcp47Lower) {
380
381 case "en":
382 case "en-us":
383 return DocumentLanguage.EN_US;
384
385 case "en-gb":
386 return DocumentLanguage.EN_GB;
387
388
389 case "zh":
390 case "zh-cn":
391 case "zh-hans":
392 return DocumentLanguage.ZH_CN;
393
394 case "zh-tw":
395 case "zh-hant":
396 return DocumentLanguage.ZH_HK;
397
398 case "zh-hk":
399 return DocumentLanguage.ZH_HK;
400
401
402 case "pt":
403 case "pt-pt":
404 return DocumentLanguage.PT_BR;
405
406 case "pt-br":
407 return DocumentLanguage.PT_BR;
408
409
410 case "nb":
411 case "no":
412 case "nb-no":
413 case "no-no":
414 return DocumentLanguage.NO_NO;
415
416 case "nn":
417 case "nn-no":
418 return DocumentLanguage.NB_NO;
419
420
421 case "he":
422 case "he-il":
423 case "iw":
424 case "iw-il":
425 return DocumentLanguage.IW_IL;
426
427
428 case "da":
429 return DocumentLanguage.DA_DK;
430
431 case "lt":
432 return DocumentLanguage.LT_LT;
433
434 case "sl":
435 return DocumentLanguage.SL_SI;
436
437 case "el":
438 return DocumentLanguage.EL_GR;
439
440 case "ru":
441 return DocumentLanguage.RU_RU;
442
443 case "hu":
444 return DocumentLanguage.HU_HU;
445
446 case "et":
447 return DocumentLanguage.ET_EE;
448
449 case "uk":
450 return DocumentLanguage.UK_UA;
451
452 case "pl":
453 return DocumentLanguage.PL_PL;
454
455 case "lv":
456 return DocumentLanguage.LV_LV;
457
458 case "fi":
459 return DocumentLanguage.FI_FI;
460
461 case "ja":
462 return DocumentLanguage.JA_JP;
463
464 case "es":
465 return DocumentLanguage.ES_ES;
466
467 case "bg":
468 return DocumentLanguage.BG_BG;
469
470 case "cs":
471 return DocumentLanguage.CS_CZ;
472
473 case "mt":
474 return DocumentLanguage.MT_MT;
475
476 case "de":
477 return DocumentLanguage.DE_DE;
478
479 case "hr":
480 return DocumentLanguage.HR_HR;
481
482 case "sk":
483 return DocumentLanguage.SK_SK;
484
485 case "sr":
486 return DocumentLanguage.SR_SR;
487
488 case "ca":
489 return DocumentLanguage.CA_CA;
490
491 case "mk":
492 return DocumentLanguage.MK_MK;
493
494 case "ko":
495 return DocumentLanguage.KO_KR;
496
497 case "nl":
498 return DocumentLanguage.NL_NL;
499
500 case "sv":
501 return DocumentLanguage.SV_SE;
502
503 case "it":
504 return DocumentLanguage.IT_IT;
505
506 case "tr":
507 return DocumentLanguage.TR_TR;
508
509 case "fr":
510 return DocumentLanguage.FR_FR;
511
512 case "ro":
513 return DocumentLanguage.RO_RO;
514
515
516 case "de-ch":
517 return DocumentLanguage.DE_CH;
518
519
520 case "de-at":
521 Log.warn(PdfUtil.class, "Warning: locale '{}' not directly supported, falling back to DE_DE",
522 bcp47);
523
524 return DocumentLanguage.DE_DE;
525 }
526
527
528 String language = locale.getLanguage();
529
530 if (language != null) {
531 String languageLower = language.toLowerCase(ULocale.ROOT.toLocale());
532
533 switch (languageLower) {
534 case "en":
535 Log.warn(PdfUtil.class,
536 "Warning: locale '{}' not directly supported, falling back to EN_US", bcp47);
537
538 return DocumentLanguage.EN_US;
539
540 case "de":
541 Log.warn(PdfUtil.class,
542 "Warning: locale '" + bcp47 + "' not directly supported, falling back to DE_DE");
543
544 return DocumentLanguage.DE_DE;
545
546 case "fr":
547 Log.warn(PdfUtil.class,
548 "Warning: locale '" + bcp47 + "' not directly supported, falling back to FR_FR");
549
550 return DocumentLanguage.FR_FR;
551
552 case "es":
553 Log.warn(PdfUtil.class,
554 "Warning: locale '" + bcp47 + "' not directly supported, falling back to ES_ES");
555
556 return DocumentLanguage.ES_ES;
557
558 case "it":
559 Log.warn(PdfUtil.class,
560 "Warning: locale '" + bcp47 + "' not directly supported, falling back to IT_IT");
561
562 return DocumentLanguage.IT_IT;
563
564 case "pt":
565 Log.warn(PdfUtil.class,
566 "Warning: locale '" + bcp47 + "' not directly supported, falling back to PT_BR");
567
568 return DocumentLanguage.PT_BR;
569
570 case "zh":
571 Log.warn(PdfUtil.class,
572 "Warning: locale '" + bcp47 + "' not directly supported, falling back to ZH_CN");
573
574 return DocumentLanguage.ZH_CN;
575
576 case "nb":
577 case "no":
578 case "nn":
579 Log.warn(PdfUtil.class,
580 "Warning: locale '" + bcp47 + "' not directly supported, falling back to NO_NO");
581
582 return DocumentLanguage.NO_NO;
583 }
584 }
585
586
587 Log.warn(PdfUtil.class, "Warning: locale '" + bcp47
588 + "' not supported by Adobe PDF Services, falling back to EN_US");
589
590 return DocumentLanguage.EN_US;
591 }
592
593
594
595
596 public static InputStream convertPdfToDocx(InputStream pdfInputStream, LocaleId locale,
597 OcrMode ocrMode) throws Exception {
598 try {
599
600 Asset asset = pdfServices.upload(pdfInputStream, PDFServicesMediaType.PDF.getMediaType());
601
602 ExportPDFParams.Builder paramsBuilder = ExportPDFParams
603 .exportPDFParamsBuilder(ExportPDFTargetFormat.DOCX);
604
605 if (ocrMode == OcrMode.ENABLED) {
606
607 paramsBuilder.withExportOCRLocale(PdfUtil.toAdobeLocale(locale));
608 }
609
610 ExportPDFParams exportParams = paramsBuilder.build();
611
612
613 ExportPDFJob exportJob = new ExportPDFJob(asset, exportParams);
614 String location = pdfServices.submit(exportJob);
615
616
617 PDFServicesResponse<ExportPDFResult> response = pdfServices.getJobResult(location,
618 ExportPDFResult.class);
619
620
621 Asset resultAsset = response.getResult().getAsset();
622 StreamAsset streamAsset = pdfServices.getContent(resultAsset);
623
624 return streamAsset.getInputStream();
625
626 } catch (Exception e) {
627 throw new OkapiIOException("Adobe PDF Services conversion failed", e);
628 }
629 }
630
631
632
633
634 public static InputStream convertDocxToPdf(InputStream docxInputStream, LocaleId locale)
635 throws Exception {
636 try {
637 Log.info(PdfUtil.class, "Uploading DOCX to Adobe PDF Services...");
638
639
640 Asset asset = pdfServices.upload(docxInputStream, PDFServicesMediaType.DOCX.getMediaType());
641
642 Log.info(PdfUtil.class, "Creating PDF from DOCX...");
643
644
645 CreatePDFParams createParams = CreatePDFParams.wordParamsBuilder()
646 .withDocumentLanguage(PdfUtil.getDocumentLanguage(locale))
647 .build();
648
649
650 CreatePDFJob createJob = new CreatePDFJob(asset).setParams(createParams);
651 String location = pdfServices.submit(createJob);
652
653 Log.info(PdfUtil.class, "Waiting for PDF creation job to complete...");
654
655
656 PDFServicesResponse<CreatePDFResult> response = pdfServices.getJobResult(location,
657 CreatePDFResult.class);
658
659
660 Asset resultAsset = response.getResult().getAsset();
661 StreamAsset streamAsset = pdfServices.getContent(resultAsset);
662
663 Log.info(PdfUtil.class, "PDF created successfully");
664
665 return streamAsset.getInputStream();
666
667 } catch (Exception e) {
668 throw new OkapiIOException("Adobe PDF Services creation failed", e);
669 }
670 }
671
672
673
674
675 public static PDFServices getPdfServices() {
676 return pdfServices;
677 }
678 }