View Javadoc

1   package pl.psnc.dl.ege.tei;
2   
3   import java.io.File;
4   import java.io.FileInputStream;
5   import java.io.FileOutputStream;
6   import java.io.IOException;
7   import java.io.InputStream;
8   import java.io.OutputStream;
9   import java.util.List;
10  import java.util.UUID;
11  import java.util.regex.Pattern;
12  
13  import javax.xml.transform.stream.StreamSource;
14  
15  import net.sf.saxon.s9api.Processor;
16  import net.sf.saxon.s9api.QName;
17  import net.sf.saxon.s9api.SaxonApiException;
18  import net.sf.saxon.s9api.Serializer;
19  import net.sf.saxon.s9api.XdmAtomicValue;
20  import net.sf.saxon.s9api.XsltCompiler;
21  import net.sf.saxon.s9api.XsltExecutable;
22  import net.sf.saxon.s9api.XsltTransformer;
23  
24  import org.apache.log4j.Logger;
25  import org.tei.exceptions.ConfigurationException;
26  import org.tei.tei.DocXTransformationProperties;
27  import org.tei.utils.SaxonProcFactory;
28  
29  import pl.psnc.dl.ege.component.Converter;
30  import pl.psnc.dl.ege.configuration.EGEConfigurationManager;
31  import pl.psnc.dl.ege.configuration.EGEConstants;
32  import pl.psnc.dl.ege.exception.ConverterException;
33  import pl.psnc.dl.ege.types.ConversionActionArguments;
34  import pl.psnc.dl.ege.types.DataType;
35  import pl.psnc.dl.ege.utils.EGEIOUtils;
36  import pl.psnc.dl.ege.utils.IOResolver;
37  
38  /**
39   * <p>
40   * EGE Converter interface implementation
41   * </p>
42   * 
43   * Provides multiple conversions for Enrich TEI format.<br>
44   * <b>Important : </b> the converter expects only compressed data. Data is
45   * compressed with standard EGE IOResolver received from
46   * EGEConfigurationManager.
47   * 
48   * @author mariuszs
49   * 
50   */
51  public class TEIConverter implements Converter {
52  
53  	private static final String EX_NO_FILE_DATA_WAS_FOUND = "No file data was found for conversion";
54  
55  	private static final Logger LOGGER = Logger.getLogger(TEIConverter.class);
56  
57  	public static final String DOCX_ERROR = "Probably trying to convert from DocX with wrong input.";
58  
59  	private IOResolver ior = EGEConfigurationManager.getInstance()
60  			.getStandardIOResolver();
61  
62  	public void convert(InputStream inputStream, OutputStream outputStream,
63  			final ConversionActionArguments conversionDataTypes)
64  			throws ConverterException, IOException {
65  		boolean found = false;
66  		try {
67  			for (ConversionActionArguments cadt : ConverterConfiguration.CONVERSIONS) {
68  				if (conversionDataTypes.equals(cadt)) {
69  					String profile = cadt.getProperties().get(
70  							ConverterConfiguration.PROFILE_KEY);
71  					LOGGER.debug("Converting from : "
72  							+ conversionDataTypes.getInputType().toString()
73  							+ " to "
74  							+ conversionDataTypes.getOutputType().toString());
75  					LOGGER.debug("Selected profile : " + profile);
76  					convertTo(inputStream, outputStream, cadt.getOutputType(),
77  							profile);
78  					found = true;
79  				}
80  			}
81  		} catch (ConfigurationException ex) {
82  			LOGGER.error(ex.getMessage(), ex);
83  			throw new ConverterException(ex.getMessage());
84  		} catch (SaxonApiException ex) {
85  			// return wrong docx input message
86  			if (ex.getMessage() != null
87  					&& ex.getMessage().contains("FileNotFoundException")
88  					&& conversionDataTypes.getInputType().getFormat().equals(
89  							Format.DOCX.getFormatName())
90  					&& conversionDataTypes.getInputType().getMimeType().equals(
91  							Format.DOCX.getMimeType())) {
92  				LOGGER.warn(ex.getMessage(), ex);
93  				throw new ConverterException(DOCX_ERROR);
94  			}
95  			LOGGER.error(ex.getMessage(), ex);
96  			throw new ConverterException(ex.getMessage());
97  		}
98  		if (!found) {
99  			throw new ConverterException(
100 					ConverterException.UNSUPPORTED_CONVERSION_TYPES);
101 		}
102 	}
103 
104 	/*
105 	 * Prepares transformation : based on MIME type.
106 	 */
107 	private void convertTo(InputStream inputStream, OutputStream outputStream,
108 			DataType dataType, String profile) throws IOException,
109 			SaxonApiException, ConfigurationException, ConverterException {
110 		String toMimeType = dataType.getMimeType();
111 		// to DOCX
112 		if (Format.DOCX.getMimeType().equals(toMimeType)) {
113 			if (!ConverterConfiguration.checkProfile(profile, Format.DOCX
114 					.getProfile())) {
115 				LOGGER.warn(ConverterConfiguration.PROFILE_NOT_FOUND_MSG);
116 				profile = ConverterConfiguration.DEFAULT_PROFILE;
117 			}
118 			Processor proc = SaxonProcFactory.getProcessor();
119 			XsltCompiler comp = proc.newXsltCompiler();
120 			transformToDocX(inputStream, outputStream, proc, comp, profile);
121 		}
122 		// to XHTML
123 		else if (Format.XHTML.getMimeType().equals(toMimeType)) {
124 			if (!ConverterConfiguration.checkProfile(profile, Format.DOCX
125 					.getProfile())) {
126 				LOGGER.warn(ConverterConfiguration.PROFILE_NOT_FOUND_MSG);
127 				profile = ConverterConfiguration.DEFAULT_PROFILE;
128 			}
129 
130 			performXsltTransformation(inputStream, outputStream, Format.XHTML
131 					.getProfile(), profile);
132 		}
133 		// to LATEX
134 		else if (Format.LATEX.getMimeType().equals(toMimeType)) {
135 			if (!ConverterConfiguration.checkProfile(profile, Format.DOCX
136 					.getProfile())) {
137 				LOGGER.warn(ConverterConfiguration.PROFILE_NOT_FOUND_MSG);
138 				profile = ConverterConfiguration.DEFAULT_PROFILE;
139 			}
140 			performXsltTransformation(inputStream, outputStream, Format.LATEX
141 					.getProfile(), profile);
142 		}
143 		// to FO
144 		else if (Format.FO.getMimeType().equals(toMimeType)
145 				&& Format.FO.getFormatName().equals(dataType.getFormat())) {
146 			if (!ConverterConfiguration.checkProfile(profile, Format.DOCX
147 					.getProfile())) {
148 				LOGGER.warn(ConverterConfiguration.PROFILE_NOT_FOUND_MSG);
149 				profile = ConverterConfiguration.DEFAULT_PROFILE;
150 			}
151 			performXsltTransformation(inputStream, outputStream, Format.FO
152 					.getProfile(), profile);
153 		}
154 		// from DOCX to TEI
155 		else if (ConverterConfiguration.XML_MIME.equals(toMimeType)
156 				&& dataType.getFormat().equals(ConverterConfiguration.TEI)) {
157 			if (!ConverterConfiguration.checkProfile(profile, Format.DOCX
158 					.getProfile())) {
159 				LOGGER.warn(ConverterConfiguration.PROFILE_NOT_FOUND_MSG);
160 				profile = ConverterConfiguration.DEFAULT_PROFILE;
161 			}
162 			transformFromDocX(inputStream, outputStream, profile);
163 		}
164 	}
165 
166 	/*
167 	 * prepares received data - decompress and open file stream.
168 	 */
169 	private InputStream prepareInputData(InputStream inputStream, File inTempDir)
170 			throws IOException, ConverterException {
171 		ior.decompressStream(inputStream, inTempDir);
172 		// perform transform
173 		File sFile = searchForData(inTempDir, "^.*\\.((?i)xml)$");
174 		if (sFile == null) {
175 			// search for any file
176 			sFile = searchForData(inTempDir, "^.*");
177 			if (sFile == null) {
178 				throw new ConverterException(EX_NO_FILE_DATA_WAS_FOUND);
179 			}
180 		}
181 		FileInputStream fis = new FileInputStream(sFile);
182 		return fis;
183 	}
184 	
185 	/*
186 	 * Search for specified by regex file 
187 	 */
188 	private File searchForData(File dir, String regex) {
189 		for (File f : dir.listFiles()) {
190 			if (!f.isDirectory() && Pattern.matches(regex, f.getName())) {
191 				return f;
192 			} else if (f.isDirectory()) {
193 				File sf = searchForData(f, regex);
194 				if (sf != null) {
195 					return sf;
196 				}
197 			}
198 		}
199 		return null;
200 	}
201 	
202 	private File prepareTempDir() {
203 		File inTempDir = null;
204 		String uid = UUID.randomUUID().toString();
205 		inTempDir = new File(EGEConstants.TEMP_PATH + File.separator + uid
206 				+ File.separator);
207 		inTempDir.mkdir();
208 		return inTempDir;
209 	}
210 	
211 	/*
212 	 * Performs transformation over XSLT 
213 	 */
214 	private void performXsltTransformation(InputStream inputStream,
215 			OutputStream outputStream, String id, String profile)
216 			throws IOException, SaxonApiException, ConverterException {
217 		FileOutputStream fos = null;
218 		InputStream is = null;
219 		File inTmpDir = null;
220 		File outTempDir = null;
221 		try {
222 			inTmpDir = prepareTempDir();
223 			is = prepareInputData(inputStream, inTmpDir);
224 			outTempDir = prepareTempDir();
225 			File resFile = new File(outTempDir + File.separator + "res.ege");
226 			fos = new FileOutputStream(resFile);
227 			Processor proc = SaxonProcFactory.getProcessor();
228 			XsltCompiler comp = proc.newXsltCompiler();
229 			XsltExecutable exec = comp.compile(resolveConfiguration(id, comp,
230 					profile));
231 			XsltTransformer transformer = exec.load();
232 			setTransformationParameters(transformer, id);
233 			transformer.setInitialContextNode(proc.newDocumentBuilder().build(
234 					new StreamSource(is)));
235 			Serializer result = new Serializer();
236 			result.setOutputStream(fos);
237 			transformer.setDestination(result);
238 			transformer.transform();
239 			ior.compressData(outTempDir, outputStream);
240 		} finally {
241 			try {
242 				is.close();
243 			} catch (Exception ex) {
244 				// do nothing
245 			}
246 			try {
247 				fos.close();
248 			} catch (Exception ex) {
249 				// do nothing
250 			}
251 			if (outTempDir != null && outTempDir.exists())
252 				EGEIOUtils.deleteDirectory(outTempDir);
253 			if (inTmpDir != null && inTmpDir.exists())
254 				EGEIOUtils.deleteDirectory(inTmpDir);
255 		}
256 
257 	}
258 
259 	/*
260 	 * Additional parameters for XHTML transformation.
261 	 */
262 	private void setTransformationParameters(XsltTransformer transformer,
263 			String id) {
264 		if (Format.XHTML.getId().equals(id)) {
265 			transformer.setParameter(new QName("STDOUT"), new XdmAtomicValue(
266 					"true"));
267 			transformer.setParameter(new QName("splitLevel"),
268 					new XdmAtomicValue("-1"));
269 			transformer.setParameter(new QName("lang"),
270 					new XdmAtomicValue("en"));
271 			transformer.setParameter(new QName("doclang"), new XdmAtomicValue(
272 					"en"));
273 			transformer.setParameter(new QName("documentationLanguage"),
274 					new XdmAtomicValue("en"));
275 		}
276 	}
277 	
278 	/*
279 	 * Performs from DocX to TEI transformation
280 	 */
281 	private void transformFromDocX(InputStream is, OutputStream os,
282 			String profile) throws IOException, SaxonApiException,
283 			ConfigurationException, ConverterException {
284 		File tmpDir = prepareTempDir();
285 		InputStream fis = null;
286 		DocXConverter docX = null;
287 		try {
288 			docX = new DocXConverter(getDocXConfig(profile), true);
289 			ior.decompressStream(is, tmpDir);
290 			// should contain only single file
291 			File docXFile = searchForData(tmpDir, "^.*\\.((?i)doc|(?i)docx)$");
292 			if (docXFile == null) {
293 				docXFile = searchForData(tmpDir, "^.*");
294 				if (docXFile == null) {
295 					throw new ConverterException(EX_NO_FILE_DATA_WAS_FOUND);
296 				}
297 			}
298 			fis = new FileInputStream(docXFile);
299 			docX.docXToTEI(fis, os);
300 		} finally {
301 			if(fis != null){
302 				try{
303 					fis.close();
304 				}catch(Exception ex){
305 					// do nothing
306 				}
307 			}
308 			if (tmpDir != null) {
309 				EGEIOUtils.deleteDirectory(tmpDir);
310 			}
311 			if(docX != null){
312 				docX.cleanUp();
313 			}
314 		}
315 	}
316 	
317 	/*
318 	 * Performs From TEI to DocX transformation
319 	 */
320 	private void transformToDocX(InputStream is, OutputStream os,
321 			Processor proc, XsltCompiler comp, final String profile)
322 			throws IOException, SaxonApiException, ConfigurationException,
323 			ConverterException {
324 		File inTmpDir = prepareTempDir();
325 		File outTmpDir = prepareTempDir();
326 		InputStream inputStream = prepareInputData(is, inTmpDir);
327 		DocXConverter docX = null;
328 		FileOutputStream fos = null;
329 		try {
330 			docX = new DocXConverter(getDocXConfig(profile));
331 			// perform conversion
332 			docX.mergeTEI(proc.newDocumentBuilder().build(
333 					new StreamSource(inputStream)));
334 			File oDocXFile = new File(outTmpDir.getAbsolutePath() + File.separator + "result.docx");
335 			fos = new FileOutputStream(oDocXFile);
336 			// pack directory to final DocX file
337 			docX.zipToStream(fos, new File(docX.getDirectoryName()));
338 			// double compress DocX file anyway
339 			ior.compressData(outTmpDir, os);
340 			// clean temporary files
341 		} finally {
342 			// perform cleanup
343 			try{
344 				inputStream.close();
345 			}catch(Exception ex){
346 				// do nothing
347 			}
348 			if(fos != null){
349 				try{
350 					fos.close();
351 				}catch(Exception ex){
352 					// do nothing
353 				}
354 			}
355 			if(docX != null){
356 				docX.cleanUp();
357 			}
358 			EGEIOUtils.deleteDirectory(inTmpDir);
359 			EGEIOUtils.deleteDirectory(outTmpDir);
360 		}
361 	}
362 
363 	private DocXTransformationProperties getDocXConfig(final String profile) {
364 		// path for temporary files (cleaned after transformation)
365 		final String tmpDir = new File(ConverterConfiguration.PATH).toString()
366 				+ File.separator + "tei-config" + File.separator + "tmp";
367 
368 		return new DocXTransformationProperties() {
369 
370 			@Override
371 			public File getOutputFile() {
372 				return null;
373 			}
374 
375 			public String docx_pp_getDocXTemplateFile() {
376 				return new File(ConverterConfiguration.STYLESHEETS_PATH)
377 						.toString()
378 						+ File.separator
379 						+ "profiles"
380 						+ File.separator
381 						+ profile
382 						+ File.separator
383 						+ Format.DOCX.getId()
384 						+ File.separator + "template.docx";
385 			}
386 
387 			public String docx_pp_getStylesheetCheckDocx() {
388 				return null;
389 			}
390 
391 			public String docx_pp_getStylesheetDocx2TEI() {
392 				return new File(ConverterConfiguration.STYLESHEETS_PATH)
393 						.toString()
394 						+ File.separator
395 						+ "profiles"
396 						+ File.separator
397 						+ profile
398 						+ File.separator
399 						+ Format.DOCX.getId()
400 						+ File.separator + "from.xsl";
401 			}
402 
403 			public String docx_pp_getStylesheetNormalizeWordStyles() {
404 				return new File(ConverterConfiguration.STYLESHEETS_PATH)
405 						.toString()
406 						+ File.separator
407 						+ Format.DOCX.getId()
408 						+ File.separator
409 						+ "normalize-word-style.xsl";
410 			}
411 
412 			public String docx_pp_getStylesheetTEI2Docx() {
413 				return new File(ConverterConfiguration.STYLESHEETS_PATH)
414 						.toString()
415 						+ File.separator
416 						+ "profiles"
417 						+ File.separator
418 						+ profile
419 						+ File.separator
420 						+ Format.DOCX.getId()
421 						+ File.separator + "to.xsl";
422 			}
423 
424 			public String docx_pp_getTempDir() {
425 				return tmpDir;
426 			}
427 		};
428 	}
429 
430 	/*
431 	 * Setups new URIResolver for XSLT compiler and returns StreamSource of XSL
432 	 * transform scheme.
433 	 */
434 	private StreamSource resolveConfiguration(final String id,
435 			XsltCompiler comp, String profile) throws IOException {
436 		comp.setURIResolver(TEIConverterURIResolver
437 				.newInstance(ConverterConfiguration.PATH + "/" + "tei-config"
438 						+ "/" + "stylesheets" + "/" + "profiles" + "/"
439 						+ profile + "/" + id));
440 		return new StreamSource(new FileInputStream(new File(
441 				ConverterConfiguration.STYLESHEETS_PATH + "profiles"
442 						+ File.separator + profile + File.separator + id
443 						+ File.separator + "to.xsl")));
444 	}
445 
446 	public List<ConversionActionArguments> getPossibleConversions() {
447 		return (List<ConversionActionArguments>) ConverterConfiguration.CONVERSIONS;
448 	}
449 
450 }