View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import java.io.File;
4   import java.io.IOException;
5   import java.io.InputStream;
6   
7   import org.apache.pdfbox.Loader;
8   import org.apache.pdfbox.pdmodel.PDDocument;
9   import org.apache.pdfbox.text.PDFTextStripper;
10  
11  import com.acumenvelocity.ath.model.OcrMode;
12  import com.adobe.pdfservices.operation.PDFServices;
13  import com.adobe.pdfservices.operation.PDFServicesMediaType;
14  import com.adobe.pdfservices.operation.PDFServicesResponse;
15  import com.adobe.pdfservices.operation.auth.Credentials;
16  import com.adobe.pdfservices.operation.auth.ServicePrincipalCredentials;
17  import com.adobe.pdfservices.operation.io.Asset;
18  import com.adobe.pdfservices.operation.io.StreamAsset;
19  import com.adobe.pdfservices.operation.pdfjobs.jobs.CreatePDFJob;
20  import com.adobe.pdfservices.operation.pdfjobs.jobs.ExportPDFJob;
21  import com.adobe.pdfservices.operation.pdfjobs.params.createpdf.CreatePDFParams;
22  import com.adobe.pdfservices.operation.pdfjobs.params.createpdf.word.DocumentLanguage;
23  import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportOCRLocale;
24  import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportPDFParams;
25  import com.adobe.pdfservices.operation.pdfjobs.params.exportpdf.ExportPDFTargetFormat;
26  import com.adobe.pdfservices.operation.pdfjobs.result.CreatePDFResult;
27  import com.adobe.pdfservices.operation.pdfjobs.result.ExportPDFResult;
28  import com.ibm.icu.util.ULocale;
29  
30  import net.sf.okapi.common.LocaleId;
31  import net.sf.okapi.common.exceptions.OkapiIOException;
32  
33  public final class PdfUtil {
34  
35    private static PDFServices pdfServices;
36  
37    private PdfUtil() {
38      // Utility class — no instances
39    }
40  
41    public static void init() {
42      initializePdfServices();
43    }
44  
45    private static void initializePdfServices() {
46      try {
47        String clientId = Const.ATH_PDF_CLIENT_ID;
48        String clientSecret = Const.ATH_PDF_CLIENT_SECRET;
49  
50        if (clientId == null || clientSecret == null) {
51          throw new OkapiIOException("Adobe PDF Services credentials not configured. " +
52              "Please set ATH_PDF_CLIENT_ID and ATH_PDF_CLIENT_SECRET");
53        }
54  
55        Credentials credentials = new ServicePrincipalCredentials(clientId, clientSecret);
56        pdfServices = new PDFServices(credentials);
57  
58      } catch (Exception e) {
59        throw new OkapiIOException("Failed to initialize Adobe PDF Services", e);
60      }
61    }
62  
63    /**
64     * Determines whether the given PDF file likely needs OCR (i.e., contains no selectable text).
65     * <p>
66     * Works with PDFBox 3.x (uses Loader.loadPDF()).
67     * </p>
68     * 
69     * @param pdfFile the local PDF file
70     * @return {@code true} if the PDF appears image-only (no selectable text),
71     *         {@code false} otherwise
72     */
73    public static boolean needsOcr(File pdfFile) {
74      if (pdfFile == null || !pdfFile.exists()) {
75        Log.warn(PdfUtil.class, "PDF file is null or missing: {}", pdfFile);
76        return true; // default to OCR
77      }
78  
79      try (PDDocument document = Loader.loadPDF(pdfFile)) {
80        int numPages = document.getNumberOfPages();
81        if (numPages == 0) {
82          Log.debug(PdfUtil.class, "Empty PDF: {}", pdfFile.getName());
83          return true;
84        }
85  
86        PDFTextStripper stripper = new PDFTextStripper();
87        stripper.setStartPage(1);
88        stripper.setEndPage(Math.min(5, numPages)); // check first few pages
89  
90        String text = stripper.getText(document);
91        if (text == null) {
92          Log.debug(PdfUtil.class, "No text extracted from '{}'; assuming OCR required.",
93              pdfFile.getName());
94          return true;
95        }
96  
97        int visibleChars = text.replaceAll("\\s+", "").length();
98        boolean imageOnly = visibleChars < 10; // heuristic threshold
99  
100       Log.info(PdfUtil.class, "PDF '{}' inspected: {} visible chars ({} pages) → needsOCR={}",
101           pdfFile.getName(), visibleChars, Math.min(5, numPages), imageOnly);
102 
103       return imageOnly;
104 
105     } catch (IOException e) {
106       Log.warn(PdfUtil.class, "Error reading PDF '{}': {}. Assuming OCR required.",
107           pdfFile.getName(),
108           e.getMessage());
109       return true;
110     }
111   }
112 
113   /**
114    * Converts an Okapi LocaleId to Adobe ExportOCRLocale.
115    * Falls back to EN_US with a warning if the locale is not supported.
116    * 
117    * @param locale the Okapi LocaleId to convert
118    * @return the corresponding ExportOCRLocale, or EN_US as fallback
119    */
120   public static ExportOCRLocale toAdobeLocale(LocaleId locale) {
121     if (locale == null) {
122       Log.warn(PdfUtil.class, "Warning: null locale provided, falling back to EN_US");
123       return ExportOCRLocale.EN_US;
124     }
125 
126     // Convert to BCP-47 format and normalize to lowercase for comparison
127     String bcp47 = locale.toBCP47();
128     String bcp47Lower = bcp47.toLowerCase(ULocale.ROOT.toLocale());
129 
130     // Direct mapping attempts
131     try {
132       // Try exact match first (e.g., "en-US" -> EN_US)
133       String enumName = bcp47.toUpperCase(ULocale.ROOT.toLocale()).replace('-', '_');
134       return ExportOCRLocale.valueOf(enumName);
135 
136     } catch (IllegalArgumentException e) {
137       // Not a direct match, continue with manual mapping
138     }
139 
140     // Manual mapping for common cases and special handling
141     switch (bcp47Lower) {
142     // English variants
143     case "en":
144     case "en-us":
145       return ExportOCRLocale.EN_US;
146 
147     case "en-gb":
148       return ExportOCRLocale.EN_GB;
149 
150     // Chinese variants
151     case "zh":
152     case "zh-cn":
153     case "zh-hans":
154       return ExportOCRLocale.ZH_CN;
155 
156     case "zh-tw":
157     case "zh-hk":
158     case "zh-hant":
159       return ExportOCRLocale.ZH_HANT;
160 
161     // Portuguese variants
162     case "pt":
163     case "pt-pt":
164       return ExportOCRLocale.PT_PT;
165 
166     case "pt-br":
167       return ExportOCRLocale.PT_BR;
168 
169     // Norwegian variants
170     case "nb":
171     case "no":
172     case "nb-no":
173       return ExportOCRLocale.NB_NO;
174 
175     case "nn":
176     case "nn-no":
177       return ExportOCRLocale.NN_NO;
178 
179     // Hebrew (special case: iw vs he)
180     case "he":
181     case "he-il":
182     case "iw":
183     case "iw-il":
184       return ExportOCRLocale.IW_IL;
185 
186     // Language-only codes - try to match with region
187     case "da":
188       return ExportOCRLocale.DA_DK;
189 
190     case "lt":
191       return ExportOCRLocale.LT_LT;
192 
193     case "sl":
194       return ExportOCRLocale.SL_SI;
195 
196     case "el":
197       return ExportOCRLocale.EL_GR;
198 
199     case "ru":
200       return ExportOCRLocale.RU_RU;
201 
202     case "hu":
203       return ExportOCRLocale.HU_HU;
204 
205     case "et":
206       return ExportOCRLocale.ET_EE;
207 
208     case "uk":
209       return ExportOCRLocale.UK_UA;
210 
211     case "pl":
212       return ExportOCRLocale.PL_PL;
213 
214     case "lv":
215       return ExportOCRLocale.LV_LV;
216 
217     case "fi":
218       return ExportOCRLocale.FI_FI;
219 
220     case "ja":
221       return ExportOCRLocale.JA_JP;
222 
223     case "es":
224       return ExportOCRLocale.ES_ES;
225 
226     case "bg":
227       return ExportOCRLocale.BG_BG;
228 
229     case "cs":
230       return ExportOCRLocale.CS_CZ;
231 
232     case "mt":
233       return ExportOCRLocale.MT_MT;
234 
235     case "de":
236       return ExportOCRLocale.DE_DE;
237 
238     case "hr":
239       return ExportOCRLocale.HR_HR;
240 
241     case "sk":
242       return ExportOCRLocale.SK_SK;
243 
244     case "sr":
245       return ExportOCRLocale.SR_SR;
246 
247     case "ca":
248       return ExportOCRLocale.CA_CA;
249 
250     case "mk":
251       return ExportOCRLocale.MK_MK;
252 
253     case "ko":
254       return ExportOCRLocale.KO_KR;
255 
256     case "nl":
257       return ExportOCRLocale.NL_NL;
258 
259     case "sv":
260       return ExportOCRLocale.SV_SE;
261 
262     case "it":
263       return ExportOCRLocale.IT_IT;
264 
265     case "tr":
266       return ExportOCRLocale.TR_TR;
267 
268     case "fr":
269       return ExportOCRLocale.FR_FR;
270 
271     case "ro":
272       return ExportOCRLocale.RO_RO;
273 
274     case "eu":
275       return ExportOCRLocale.EU_ES;
276 
277     case "gl":
278       return ExportOCRLocale.GL_ES;
279 
280     // German Switzerland special case
281     case "de-ch":
282       return ExportOCRLocale.DE_CH;
283     }
284 
285     // Try language-based fallback for region variants
286     String language = locale.getLanguage();
287 
288     if (language != null) {
289       String languageLower = language.toLowerCase(ULocale.ROOT.toLocale());
290 
291       switch (languageLower) {
292       case "en":
293         Log.warn(PdfUtil.class,
294             "Warning: locale '" + bcp47 + "' not directly supported, falling back to EN_US");
295 
296         return ExportOCRLocale.EN_US;
297 
298       case "de":
299         Log.warn(PdfUtil.class,
300             "Warning: locale '" + bcp47 + "' not directly supported, falling back to DE_DE");
301 
302         return ExportOCRLocale.DE_DE;
303 
304       case "fr":
305         Log.warn(PdfUtil.class,
306             "Warning: locale '" + bcp47 + "' not directly supported, falling back to FR_FR");
307 
308         return ExportOCRLocale.FR_FR;
309 
310       case "es":
311         Log.warn(PdfUtil.class,
312             "Warning: locale '" + bcp47 + "' not directly supported, falling back to ES_ES");
313 
314         return ExportOCRLocale.ES_ES;
315 
316       case "it":
317         Log.warn(PdfUtil.class,
318             "Warning: locale '" + bcp47 + "' not directly supported, falling back to IT_IT");
319 
320         return ExportOCRLocale.IT_IT;
321 
322       case "pt":
323         Log.warn(PdfUtil.class,
324             "Warning: locale '" + bcp47 + "' not directly supported, falling back to PT_PT");
325 
326         return ExportOCRLocale.PT_PT;
327 
328       case "zh":
329         Log.warn(PdfUtil.class,
330             "Warning: locale '" + bcp47 + "' not directly supported, falling back to ZH_CN");
331 
332         return ExportOCRLocale.ZH_CN;
333 
334       case "nb":
335       case "no":
336       case "nn":
337         Log.warn(PdfUtil.class,
338             "Warning: locale '" + bcp47 + "' not directly supported, falling back to NB_NO");
339 
340         return ExportOCRLocale.NB_NO;
341       }
342     }
343 
344     // Final fallback
345     Log.warn(PdfUtil.class, "Warning: locale '" + bcp47
346         + "' not supported by Adobe PDF Services, falling back to EN_US");
347 
348     return ExportOCRLocale.EN_US;
349   }
350 
351   /**
352    * Converts an Okapi LocaleId to Adobe DocumentLanguage.
353    * Falls back to EN_US with a warning if the locale is not supported.
354    * 
355    * @param locale the Okapi LocaleId to convert
356    * @return the corresponding DocumentLanguage, or EN_US as fallback
357    */
358   public static DocumentLanguage getDocumentLanguage(LocaleId locale) {
359     if (locale == null) {
360       Log.warn(PdfUtil.class, "Warning: null locale provided, falling back to EN_US");
361       return DocumentLanguage.EN_US;
362     }
363 
364     // Convert to BCP-47 format and normalize to lowercase for comparison
365     String bcp47 = locale.toBCP47();
366     String bcp47Lower = bcp47.toLowerCase(ULocale.ROOT.toLocale());
367 
368     // Direct mapping attempts
369     try {
370       // Try exact match first (e.g., "en-US" -> EN_US)
371       String enumName = bcp47.toUpperCase(ULocale.ROOT.toLocale()).replace('-', '_');
372       return DocumentLanguage.valueOf(enumName);
373 
374     } catch (IllegalArgumentException e) {
375       // Not a direct match, continue with manual mapping
376     }
377 
378     // Manual mapping for common cases and special handling
379     switch (bcp47Lower) {
380     // English variants
381     case "en":
382     case "en-us":
383       return DocumentLanguage.EN_US;
384 
385     case "en-gb":
386       return DocumentLanguage.EN_GB;
387 
388     // Chinese variants
389     case "zh":
390     case "zh-cn":
391     case "zh-hans":
392       return DocumentLanguage.ZH_CN;
393 
394     case "zh-tw":
395     case "zh-hant":
396       return DocumentLanguage.ZH_HK; // Fallback TW to HK
397 
398     case "zh-hk":
399       return DocumentLanguage.ZH_HK;
400 
401     // Portuguese variants
402     case "pt":
403     case "pt-pt":
404       return DocumentLanguage.PT_BR; // Fallback PT to BR (more common)
405 
406     case "pt-br":
407       return DocumentLanguage.PT_BR;
408 
409     // Norwegian variants
410     case "nb":
411     case "no":
412     case "nb-no":
413     case "no-no":
414       return DocumentLanguage.NO_NO;
415 
416     case "nn":
417     case "nn-no":
418       return DocumentLanguage.NB_NO; // Fallback Nynorsk to Bokmål
419 
420     // Hebrew (special case: iw vs he)
421     case "he":
422     case "he-il":
423     case "iw":
424     case "iw-il":
425       return DocumentLanguage.IW_IL;
426 
427     // Language-only codes - try to match with region
428     case "da":
429       return DocumentLanguage.DA_DK;
430 
431     case "lt":
432       return DocumentLanguage.LT_LT;
433 
434     case "sl":
435       return DocumentLanguage.SL_SI;
436 
437     case "el":
438       return DocumentLanguage.EL_GR;
439 
440     case "ru":
441       return DocumentLanguage.RU_RU;
442 
443     case "hu":
444       return DocumentLanguage.HU_HU;
445 
446     case "et":
447       return DocumentLanguage.ET_EE;
448 
449     case "uk":
450       return DocumentLanguage.UK_UA;
451 
452     case "pl":
453       return DocumentLanguage.PL_PL;
454 
455     case "lv":
456       return DocumentLanguage.LV_LV;
457 
458     case "fi":
459       return DocumentLanguage.FI_FI;
460 
461     case "ja":
462       return DocumentLanguage.JA_JP;
463 
464     case "es":
465       return DocumentLanguage.ES_ES;
466 
467     case "bg":
468       return DocumentLanguage.BG_BG;
469 
470     case "cs":
471       return DocumentLanguage.CS_CZ;
472 
473     case "mt":
474       return DocumentLanguage.MT_MT;
475 
476     case "de":
477       return DocumentLanguage.DE_DE;
478 
479     case "hr":
480       return DocumentLanguage.HR_HR;
481 
482     case "sk":
483       return DocumentLanguage.SK_SK;
484 
485     case "sr":
486       return DocumentLanguage.SR_SR;
487 
488     case "ca":
489       return DocumentLanguage.CA_CA;
490 
491     case "mk":
492       return DocumentLanguage.MK_MK;
493 
494     case "ko":
495       return DocumentLanguage.KO_KR;
496 
497     case "nl":
498       return DocumentLanguage.NL_NL;
499 
500     case "sv":
501       return DocumentLanguage.SV_SE;
502 
503     case "it":
504       return DocumentLanguage.IT_IT;
505 
506     case "tr":
507       return DocumentLanguage.TR_TR;
508 
509     case "fr":
510       return DocumentLanguage.FR_FR;
511 
512     case "ro":
513       return DocumentLanguage.RO_RO;
514 
515     // German Switzerland special case
516     case "de-ch":
517       return DocumentLanguage.DE_CH;
518 
519     // Austrian German fallback
520     case "de-at":
521       Log.warn(PdfUtil.class, "Warning: locale '{}' not directly supported, falling back to DE_DE",
522           bcp47);
523 
524       return DocumentLanguage.DE_DE;
525     }
526 
527     // Try language-based fallback for region variants
528     String language = locale.getLanguage();
529 
530     if (language != null) {
531       String languageLower = language.toLowerCase(ULocale.ROOT.toLocale());
532 
533       switch (languageLower) {
534       case "en":
535         Log.warn(PdfUtil.class,
536             "Warning: locale '{}' not directly supported, falling back to EN_US", bcp47);
537 
538         return DocumentLanguage.EN_US;
539 
540       case "de":
541         Log.warn(PdfUtil.class,
542             "Warning: locale '" + bcp47 + "' not directly supported, falling back to DE_DE");
543 
544         return DocumentLanguage.DE_DE;
545 
546       case "fr":
547         Log.warn(PdfUtil.class,
548             "Warning: locale '" + bcp47 + "' not directly supported, falling back to FR_FR");
549 
550         return DocumentLanguage.FR_FR;
551 
552       case "es":
553         Log.warn(PdfUtil.class,
554             "Warning: locale '" + bcp47 + "' not directly supported, falling back to ES_ES");
555 
556         return DocumentLanguage.ES_ES;
557 
558       case "it":
559         Log.warn(PdfUtil.class,
560             "Warning: locale '" + bcp47 + "' not directly supported, falling back to IT_IT");
561 
562         return DocumentLanguage.IT_IT;
563 
564       case "pt":
565         Log.warn(PdfUtil.class,
566             "Warning: locale '" + bcp47 + "' not directly supported, falling back to PT_BR");
567 
568         return DocumentLanguage.PT_BR;
569 
570       case "zh":
571         Log.warn(PdfUtil.class,
572             "Warning: locale '" + bcp47 + "' not directly supported, falling back to ZH_CN");
573 
574         return DocumentLanguage.ZH_CN;
575 
576       case "nb":
577       case "no":
578       case "nn":
579         Log.warn(PdfUtil.class,
580             "Warning: locale '" + bcp47 + "' not directly supported, falling back to NO_NO");
581 
582         return DocumentLanguage.NO_NO;
583       }
584     }
585 
586     // Final fallback
587     Log.warn(PdfUtil.class, "Warning: locale '" + bcp47
588         + "' not supported by Adobe PDF Services, falling back to EN_US");
589 
590     return DocumentLanguage.EN_US;
591   }
592 
593   /**
594    * Convert PDF to DOCX using Adobe PDF Services
595    */
596   public static InputStream convertPdfToDocx(InputStream pdfInputStream, LocaleId locale,
597       OcrMode ocrMode) throws Exception {
598     try {
599       // Upload PDF asset
600       Asset asset = pdfServices.upload(pdfInputStream, PDFServicesMediaType.PDF.getMediaType());
601 
602       ExportPDFParams.Builder paramsBuilder = ExportPDFParams
603           .exportPDFParamsBuilder(ExportPDFTargetFormat.DOCX);
604 
605       if (ocrMode == OcrMode.ENABLED) {
606         // Perform OCR for the source locale
607         paramsBuilder.withExportOCRLocale(PdfUtil.toAdobeLocale(locale));
608       }
609 
610       ExportPDFParams exportParams = paramsBuilder.build();
611 
612       // Create and submit export job
613       ExportPDFJob exportJob = new ExportPDFJob(asset, exportParams);
614       String location = pdfServices.submit(exportJob);
615 
616       // Get job result
617       PDFServicesResponse<ExportPDFResult> response = pdfServices.getJobResult(location,
618           ExportPDFResult.class);
619 
620       // Download DOCX content
621       Asset resultAsset = response.getResult().getAsset();
622       StreamAsset streamAsset = pdfServices.getContent(resultAsset);
623 
624       return streamAsset.getInputStream();
625 
626     } catch (Exception e) {
627       throw new OkapiIOException("Adobe PDF Services conversion failed", e);
628     }
629   }
630 
631   /**
632    * Convert DOCX to PDF using Adobe PDF Services
633    */
634   public static InputStream convertDocxToPdf(InputStream docxInputStream, LocaleId locale)
635       throws Exception {
636     try {
637       Log.info(PdfUtil.class, "Uploading DOCX to Adobe PDF Services...");
638 
639       // Upload DOCX asset
640       Asset asset = pdfServices.upload(docxInputStream, PDFServicesMediaType.DOCX.getMediaType());
641 
642       Log.info(PdfUtil.class, "Creating PDF from DOCX...");
643 
644       // Create PDF parameters
645       CreatePDFParams createParams = CreatePDFParams.wordParamsBuilder()
646           .withDocumentLanguage(PdfUtil.getDocumentLanguage(locale))
647           .build();
648 
649       // Create and submit job
650       CreatePDFJob createJob = new CreatePDFJob(asset).setParams(createParams);
651       String location = pdfServices.submit(createJob);
652 
653       Log.info(PdfUtil.class, "Waiting for PDF creation job to complete...");
654 
655       // Get job result
656       PDFServicesResponse<CreatePDFResult> response = pdfServices.getJobResult(location,
657           CreatePDFResult.class);
658 
659       // Download PDF content
660       Asset resultAsset = response.getResult().getAsset();
661       StreamAsset streamAsset = pdfServices.getContent(resultAsset);
662 
663       Log.info(PdfUtil.class, "PDF created successfully");
664 
665       return streamAsset.getInputStream();
666 
667     } catch (Exception e) {
668       throw new OkapiIOException("Adobe PDF Services creation failed", e);
669     }
670   }
671 
672   /**
673    * Get the PDF Services instance for the writer to use
674    */
675   public static PDFServices getPdfServices() {
676     return pdfServices;
677   }
678 }