View Javadoc
1   package com.acumenvelocity.ath.filters.pdf;
2   
3   import java.io.InputStream;
4   import java.util.ArrayList;
5   import java.util.List;
6   
7   import com.acumenvelocity.ath.common.Log;
8   import com.acumenvelocity.ath.common.PdfUtil;
9   
10  import net.sf.okapi.common.Event;
11  import net.sf.okapi.common.IParameters;
12  import net.sf.okapi.common.LocaleId;
13  import net.sf.okapi.common.MimeTypeMapper;
14  import net.sf.okapi.common.UsingParameters;
15  import net.sf.okapi.common.encoder.EncoderManager;
16  import net.sf.okapi.common.exceptions.OkapiIOException;
17  import net.sf.okapi.common.filters.FilterConfiguration;
18  import net.sf.okapi.common.filters.IFilter;
19  import net.sf.okapi.common.filters.IFilterConfigurationMapper;
20  import net.sf.okapi.common.filterwriter.IFilterWriter;
21  import net.sf.okapi.common.resource.RawDocument;
22  import net.sf.okapi.common.resource.StartDocument;
23  import net.sf.okapi.common.skeleton.GenericSkeletonWriter;
24  import net.sf.okapi.common.skeleton.ISkeletonWriter;
25  import net.sf.okapi.filters.openxml.ConditionalParameters;
26  import net.sf.okapi.filters.openxml.OpenXMLFilter;
27  import net.sf.okapi.filters.openxml.OpenXMLFilterWriter;
28  
29  /**
30   * Implements the IFilter interface for PDF files using Adobe PDF Services.
31   * Converts PDF to DOCX, processes with OpenXML filter, and stores segments in Solr.
32   */
33  @UsingParameters(Parameters.class)
34  public class AthPdfFilter implements IFilter {
35    private EncoderManager encoderManager;
36    private RawDocument input;
37    private OpenXMLFilter docxFilter;
38    private Parameters params;
39    private LocaleId srcLoc;
40  
41    public AthPdfFilter() {
42      super();
43      params = new Parameters();
44    }
45  
46    @Override
47    public void cancel() {
48      if (docxFilter != null) {
49        docxFilter.cancel();
50      }
51    }
52  
53    @Override
54    public void close() {
55      if (input != null) {
56        input.close();
57      }
58  
59      if (docxFilter != null) {
60        docxFilter.close();
61      }
62    }
63  
64    @Override
65    public String getName() {
66      return "okf_pdf";
67    }
68  
69    @Override
70    public String getDisplayName() {
71      return "PDF Filter (Adobe services)";
72    }
73  
74    @Override
75    public String getMimeType() {
76      return MimeTypeMapper.PDF_MIME_TYPE;
77    }
78  
79    @Override
80    public Parameters getParameters() {
81      return params;
82    }
83  
84    @Override
85    public void setParameters(IParameters params) {
86      this.params = (Parameters) params;
87    }
88  
89    @Override
90    public void setFilterConfigurationMapper(IFilterConfigurationMapper fcMapper) {
91    }
92  
93    @Override
94    public ISkeletonWriter createSkeletonWriter() {
95      return new GenericSkeletonWriter();
96    }
97  
98    @Override
99    public IFilterWriter createFilterWriter() {
100     return new AthPdfFilterWriter((OpenXMLFilterWriter) docxFilter.createFilterWriter());
101   }
102 
103   @Override
104   public List<FilterConfiguration> getConfigurations() {
105     List<FilterConfiguration> list = new ArrayList<>();
106 
107     list.add(new FilterConfiguration("okf_pdf", MimeTypeMapper.PDF_MIME_TYPE, getClass().getName(),
108         "PDF (Portable Document Format)",
109         "Configuration for PDF documents using Adobe PDF Services",
110         null, ".pdf;"));
111 
112     return list;
113   }
114 
115   @Override
116   public EncoderManager getEncoderManager() {
117     if (encoderManager == null) {
118       encoderManager = new EncoderManager();
119 
120       encoderManager.setMapping(MimeTypeMapper.PDF_MIME_TYPE,
121           "net.sf.okapi.common.encoder.DefaultEncoder");
122     }
123 
124     return encoderManager;
125   }
126 
127   @Override
128   public void open(RawDocument input) {
129     open(input, true);
130   }
131 
132   @Override
133   public void open(RawDocument input, boolean generateSkeleton) {
134     this.input = input;
135 
136     try {
137       Log.info(getClass(), "Converting PDF to DOCX using Adobe PDF Services...");
138 
139       // Convert PDF to DOCX using Adobe PDF Services
140       InputStream docxInputStream = PdfUtil.convertPdfToDocx(input.getStream(), srcLoc,
141           params.getOcrMode());
142 
143       Log.info(getClass(), "PDF converted to DOCX successfully");
144 
145       // Process DOCX with OpenXML filter
146       docxFilter = new OpenXMLFilter();
147 
148       ConditionalParameters docxParams = new ConditionalParameters();
149       docxParams.fromString(params.toString());
150 
151       docxFilter.setParameters(docxParams);
152 
153       RawDocument docxRawDoc = new RawDocument(
154           docxInputStream,
155           input.getEncoding(),
156           input.getSourceLocale());
157 
158       docxRawDoc.setFilterConfigId("okf_openxml");
159       docxRawDoc.setId(input.getId());
160 
161       docxFilter.open(docxRawDoc, generateSkeleton);
162       Log.info(getClass(), "OpenXML filter opened successfully");
163 
164     } catch (Exception e) {
165       throw new OkapiIOException("Error processing PDF file", e);
166     }
167   }
168 
169   @Override
170   public boolean hasNext() {
171     return docxFilter != null && docxFilter.hasNext();
172   }
173 
174   @Override
175   public Event next() {
176     Event e = docxFilter.next();
177 
178     if (e.isStartDocument()) {
179       StartDocument sd = e.getStartDocument();
180       sd.setMimeType(MimeTypeMapper.PDF_MIME_TYPE);
181       sd.setName(input.getId());
182       this.srcLoc = sd.getLocale();
183       sd.setFilterWriter(createFilterWriter());
184     }
185 
186     return e;
187   }
188 }