001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.scheme.local;
023
024import java.beans.ConstructorProperties;
025import java.io.FileOutputStream;
026import java.io.IOException;
027import java.io.InputStream;
028import java.io.InputStreamReader;
029import java.io.LineNumberReader;
030import java.io.OutputStream;
031import java.io.OutputStreamWriter;
032import java.io.PrintWriter;
033import java.io.UnsupportedEncodingException;
034import java.nio.charset.Charset;
035import java.util.Properties;
036
037import cascading.flow.FlowProcess;
038import cascading.management.annotation.Property;
039import cascading.management.annotation.PropertyDescription;
040import cascading.management.annotation.Visibility;
041import cascading.scheme.FileFormat;
042import cascading.scheme.SinkCall;
043import cascading.scheme.SourceCall;
044import cascading.scheme.util.DelimitedParser;
045import cascading.tap.CompositeTap;
046import cascading.tap.SinkMode;
047import cascading.tap.Tap;
048import cascading.tap.TapException;
049import cascading.tap.local.FileTap;
050import cascading.tuple.Fields;
051import cascading.tuple.Tuple;
052import cascading.tuple.TupleEntry;
053import cascading.tuple.util.TupleViews;
054
055/**
056 * Class TextDelimited provides direct support for delimited text files, like
057 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
058 * <p>
059 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
060 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
061 * be skipped.
062 * <p>
063 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
064 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
065 * file and used during planning. The header will be parsed with the same rules as the body of the file.
066 * <p>
067 * By default headers are not skipped.
068 * <p>
069 * TextDelimited may also be used to write a "header" in a file. The fields names for the header are taken directly
070 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
071 * resolved field names will be used, if any.
072 * <p>
073 * By default headers are not written.
074 * <p>
075 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
076 * be set to {@code true}.
077 * <p>
078 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
079 * <p>
080 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
081 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
082 * for the missing fields.
083 * <p>
084 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
085 * If safe is {@code false}, a {@link TapException} will be thrown.
086 * <p>
087 * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is
088 * COMMA delimited but may have COMMA's in a value, the whole value should be surrounded by the quote string, typically
089 * double quotes ({@literal "}).
090 * <p>
091 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
092 * <p>
093 * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically
094 * default to strict == false as the number of fields parsed are arbitrary or unknown. A type array may not be given
095 * either, so all values will be returned as Strings.
096 * <p>
097 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
098 * argument.
099 * <p>
100 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
101 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
102 * <p>
103 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
104 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
105 * <p>
106 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
107 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
108 * <p>
109 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
110 * are responsible for cleansing large data-sets when faced with the problem.
111 * <p>
112 * DelimitedParser may be sub-classed and extended if necessary.
113 * <p>
114 * In order to read or write compressed files, pass a {@link cascading.scheme.local.CompressorScheme.Compressor}
115 * instance to the appropriate constructors. See {@link Compressors} for provided compression algorithms.
116 *
117 * @see TextLine
118 * @see Compressors
119 */
120public class TextDelimited extends CompressorScheme<LineNumberReader, PrintWriter> implements FileFormat
121  {
122  public static final String DEFAULT_CHARSET = "UTF-8"; // default charset name; initial value of charsetName below
123
124  private final boolean skipHeader; // NOTE(review): presumably skips the first line when sourcing, per class docs - confirm
125  private final boolean writeHeader; // NOTE(review): presumably writes a header line when sinking, per class docs - confirm
126  private final DelimitedParser delimitedParser; // parser supplied by, or built in, the constructors
127  private String charsetName = DEFAULT_CHARSET; // starts out as DEFAULT_CHARSET
128
129  /**
130   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
131   * {@link Fields#ALL} and using TAB as the default delimiter.
132   * <p>
133   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
134   * with a {@link cascading.pipe.Checkpoint} Tap. Equivalent to calling {@code new TextDelimited( Fields.ALL )}.
135   */
136  public TextDelimited()
137    {
138    this( Fields.ALL );
139    }
140
141  /**
142   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
143   * {@link Fields#ALL} and using the given delimiter, with no quote string and no field types.
144   * <p>
145   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
146   * with a {@link cascading.pipe.Checkpoint} Tap.
147   *
148   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
149   * @param delimiter of type String
150   */
151  @ConstructorProperties({"hasHeader", "delimiter"})
152  public TextDelimited( boolean hasHeader, String delimiter )
153    {
154    this( Fields.ALL, hasHeader, delimiter, null, (Class[]) null );
155    }
156
157  /**
158   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
159   * {@link Fields#ALL} and using the given delimiter and quote string, with no field types.
160   * <p>
161   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
162   * with a {@link cascading.pipe.Checkpoint} Tap.
163   *
164   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
165   * @param delimiter of type String
166   * @param quote     of type String
167   */
168  @ConstructorProperties({"hasHeader", "delimiter", "quote"})
169  public TextDelimited( boolean hasHeader, String delimiter, String quote )
170    {
171    this( Fields.ALL, hasHeader, delimiter, quote, (Class[]) null );
172    }
173
174  /**
175   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
176   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
177   * <p>
178   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
179   * with a {@link cascading.pipe.Checkpoint} Tap.
180   *
181   * @param hasHeader       of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
182   * @param delimitedParser of type DelimitedParser
183   */
184  @ConstructorProperties({"hasHeader", "delimitedParser"})
185  public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser )
186    {
187    this( Fields.ALL, hasHeader, hasHeader, delimitedParser );
188    }
189
190  /**
191   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
192   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
193   * <p>
194   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
195   * with a {@link cascading.pipe.Checkpoint} Tap.
196   * <p>
197   * This constructor passes {@code true} for both the {@code skipHeader} and {@code writeHeader} values.
198   *
199   * @param delimitedParser of type DelimitedParser
200   */
201  @ConstructorProperties({"delimitedParser"})
202  public TextDelimited( DelimitedParser delimitedParser )
203    {
204    this( Fields.ALL, true, true, delimitedParser );
205    }
206
207  /**
208   * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter, no quote string and no field types.
209   *
210   * @param fields of type Fields
211   */
212  @ConstructorProperties({"fields"})
213  public TextDelimited( Fields fields )
214    {
215    this( fields, "\t", null, null );
216    }
217
218  /**
219   * Constructor TextDelimited creates a new TextDelimited instance using the given delimiter, no quote string and no field types.
220   *
221   * @param fields    of type Fields
222   * @param delimiter of type String
223   */
224  @ConstructorProperties({"fields", "delimiter"})
225  public TextDelimited( Fields fields, String delimiter )
226    {
227    this( fields, delimiter, null, null );
228    }
229
230  /**
231   * Constructor TextDelimited creates a new TextDelimited instance with no quote string and no field types.
232   *
233   * @param fields    of type Fields
234   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
235   * @param delimiter of type String
236   */
237  @ConstructorProperties({"fields", "hasHeader", "delimiter"})
238  public TextDelimited( Fields fields, boolean hasHeader, String delimiter )
239    {
240    this( fields, hasHeader, hasHeader, delimiter, null, null );
241    }
242
243  /**
244   * Constructor TextDelimited creates a new TextDelimited instance with no quote string and no field types.
245   * @param fields      of type Fields
246   * @param skipHeader  of type boolean
247   * @param writeHeader of type boolean
248   * @param delimiter   of type String
249   */
250  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"})
251  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter )
252    {
253    this( fields, skipHeader, writeHeader, delimiter, null, null );
254    }
255
256  /**
257   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
258   *
259   * @param fields    of type Fields
260   * @param delimiter of type String
261   * @param types     of type Class[]
262   */
263  @ConstructorProperties({"fields", "delimiter", "types"})
264  public TextDelimited( Fields fields, String delimiter, Class[] types )
265    {
266    this( fields, delimiter, null, types );
267    }
268
269  /**
270   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
271   *
272   * @param fields    of type Fields
273   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
274   * @param delimiter of type String
275   * @param types     of type Class[]
276   */
277  @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"})
278  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types )
279    {
280    this( fields, hasHeader, hasHeader, delimiter, null, types );
281    }
282
283  /**
284   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
285   *
286   * @param fields      of type Fields
287   * @param skipHeader  of type boolean
288   * @param writeHeader of type boolean
289   * @param delimiter   of type String
290   * @param types       of type Class[]
291   */
292  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"})
293  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
294    {
295    this( fields, skipHeader, writeHeader, delimiter, null, types );
296    }
297
298  /**
299   * Constructor TextDelimited creates a new TextDelimited instance with {@code hasHeader} set to {@code false}.
300   *
301   * @param fields    of type Fields
302   * @param delimiter of type String
303   * @param quote     of type String
304   * @param types     of type Class[]
305   */
306  @ConstructorProperties({"fields", "delimiter", "quote", "types"})
307  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types )
308    {
309    this( fields, false, delimiter, quote, types );
310    }
311
312  /**
313   * Constructor TextDelimited creates a new TextDelimited instance with {@code safe} set to {@code true}.
314   *
315   * @param fields    of type Fields
316   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
317   * @param delimiter of type String
318   * @param quote     of type String
319   * @param types     of type Class[]
320   */
321  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"})
322  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types )
323    {
324    this( fields, hasHeader, hasHeader, delimiter, quote, types, true );
325    }
326
327  /**
328   * Constructor TextDelimited creates a new TextDelimited instance with {@code safe} set to {@code true}.
329   *
330   * @param fields      of type Fields
331   * @param skipHeader  of type boolean
332   * @param writeHeader of type boolean
333   * @param delimiter   of type String
334   * @param quote       of type String
335   * @param types       of type Class[]
336   */
337  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
338  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
339    {
340    this( fields, skipHeader, writeHeader, delimiter, quote, types, true );
341    }
342
343  /**
344   * Constructor TextDelimited creates a new TextDelimited instance with {@code hasHeader} set to {@code false}.
345   *
346   * @param fields    of type Fields
347   * @param delimiter of type String
348   * @param quote     of type String
349   * @param types     of type Class[]
350   * @param safe      of type boolean
351   */
352  @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"})
353  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe )
354    {
355    this( fields, false, delimiter, quote, types, safe );
356    }
357
358  /**
359   * Constructor TextDelimited creates a new TextDelimited instance with {@code strict} set to {@code true}.
360   *
361   * @param fields    of type Fields
362   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
363   * @param delimiter of type String
364   * @param quote     of type String
365   * @param types     of type Class[]
366   * @param safe      of type boolean
367   */
368  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"})
369  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
370    {
371    this( fields, hasHeader, hasHeader, delimiter, true, quote, types, safe );
372    }
373
374  /**
375   * Constructor TextDelimited creates a new TextDelimited instance with {@code strict} set to {@code true}.
376   *
377   * @param fields      of type Fields
378   * @param hasHeader   of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
379   * @param delimiter   of type String
380   * @param quote       of type String
381   * @param types       of type Class[]
382   * @param safe        of type boolean
383   * @param charsetName of type String
384   */
385  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"})
386  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName )
387    {
388    this( fields, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName );
389    }
390
391  /**
392   * Constructor TextDelimited creates a new TextDelimited instance with {@code strict} set to {@code true}.
393   *
394   * @param fields      of type Fields
395   * @param skipHeader  of type boolean
396   * @param writeHeader of type boolean
397   * @param delimiter   of type String
398   * @param quote       of type String
399   * @param types       of type Class[]
400   * @param safe        of type boolean
401   */
402  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"})
403  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
404    {
405    this( fields, skipHeader, writeHeader, delimiter, true, quote, types, safe );
406    }
407
408  /**
409   * Constructor TextDelimited creates a new TextDelimited instance with {@code hasHeader} false, no field types and {@code safe} true.
410   *
411   * @param fields    of type Fields
412   * @param delimiter of type String
413   * @param quote     of type String
414   */
415  @ConstructorProperties({"fields", "delimiter", "quote"})
416  public TextDelimited( Fields fields, String delimiter, String quote )
417    {
418    this( fields, false, delimiter, quote, null, true );
419    }
420
421  /**
422   * Constructor TextDelimited creates a new TextDelimited instance with no field types and {@code safe} set to {@code true}.
423   *
424   * @param fields    of type Fields
425   * @param hasHeader of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
426   * @param delimiter of type String
427   * @param quote     of type String
428   */
429  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"})
430  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote )
431    {
432    this( fields, hasHeader, delimiter, quote, null, true );
433    }
434
435  /**
436   * Constructor TextDelimited creates a new TextDelimited instance with no field types and {@code safe} set to {@code true}.
437   *
438   * @param fields      of type Fields
439   * @param hasHeader   of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
440   * @param delimiter   of type String
441   * @param quote       of type String
442   * @param charsetName of type String
443   */
444  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "charsetName"})
445  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, String charsetName )
446    {
447    this( fields, hasHeader, delimiter, quote, null, true, charsetName );
448    }
449
450  /**
451   * Constructor TextDelimited creates a new TextDelimited instance using {@link #DEFAULT_CHARSET} as the charset name.
452   *
453   * @param fields      of type Fields
454   * @param skipHeader  of type boolean
455   * @param writeHeader of type boolean
456   * @param delimiter   of type String
457   * @param strict      of type boolean
458   * @param quote       of type String
459   * @param types       of type Class[]
460   * @param safe        of type boolean
461   */
462  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", "safe"})
463  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe )
464    {
465    this( fields, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET );
466    }
467
468  /**
469   * Constructor TextDelimited creates a new TextDelimited instance, building a new {@link DelimitedParser} from the given values.
470   *
471   * @param fields      of type Fields
472   * @param skipHeader  of type boolean
473   * @param writeHeader of type boolean
474   * @param delimiter   of type String, passed to the new DelimitedParser
475   * @param strict      of type boolean, passed to the new DelimitedParser
476   * @param quote       of type String, passed to the new DelimitedParser
477   * @param types       of type Class[], passed to the new DelimitedParser
478   * @param safe        of type boolean, passed to the new DelimitedParser
479   * @param charsetName of type String
480   */
481  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", "safe",
482                          "charsetName"})
483  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName )
484    {
485    this( fields, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) );
486    }
487
488  /**
489   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} as the charsetName.
490   * @param fields          of type Fields
491   * @param skipHeader      of type boolean
492   * @param writeHeader     of type boolean
493   * @param delimitedParser of type DelimitedParser
494   */
495  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"})
496  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
497    {
498    this( fields, skipHeader, writeHeader, null, delimitedParser );
499    }
500
501  /**
502   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} as the charsetName.
503   *
504   * @param fields          of type Fields
505   * @param hasHeader       of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
506   * @param delimitedParser of type DelimitedParser
507   */
508  @ConstructorProperties({"fields", "hasHeader", "delimitedParser"})
509  public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser )
510    {
511    this( fields, hasHeader, hasHeader, null, delimitedParser );
512    }
513
514  /**
515   * Constructor TextDelimited creates a new TextDelimited instance, delegating with {@code null} in the compressor position.
516   * @param fields          of type Fields
517   * @param skipHeader      of type boolean
518   * @param writeHeader     of type boolean
519   * @param charsetName     of type String
520   * @param delimitedParser of type DelimitedParser
521   */
522  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "charsetName", "delimitedParser"})
523  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser )
524    {
525    this( fields, null, skipHeader, writeHeader, charsetName, delimitedParser );
526    }
527
528  /**
529   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
530   * {@link Fields#ALL} and using TAB as the default delimiter.
531   * <p>
532   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
533   * with a {@link cascading.pipe.Checkpoint} Tap. Equivalent to passing {@link Fields#ALL} with the given compressor.
534   *
535   * @param compressor of type Compressor, see {@link Compressors}
536   */
537  @ConstructorProperties("compressor")
538  public TextDelimited( Compressor compressor )
539    {
540    this( Fields.ALL, compressor );
541    }
542
543  /**
544   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
545   * {@link Fields#ALL} and using the given delimiter, with no quote string and no field types.
546   * <p>
547   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
548   * with a {@link cascading.pipe.Checkpoint} Tap.
549   *
550   * @param compressor of type Compressor, see {@link Compressors}
551   * @param hasHeader  of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
552   * @param delimiter  of type String
553   */
554  @ConstructorProperties({"compressor", "hasHeader", "delimiter"})
555  public TextDelimited( Compressor compressor, boolean hasHeader, String delimiter )
556    {
557    this( Fields.ALL, compressor, hasHeader, delimiter, null, (Class[]) null );
558    }
559
560  /**
561   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
562   * {@link Fields#ALL} and using the given delimiter and quote string, with no field types.
563   * <p>
564   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
565   * with a {@link cascading.pipe.Checkpoint} Tap.
566   *
567   * @param compressor of type Compressor, see {@link Compressors}
568   * @param hasHeader  of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
569   * @param delimiter  of type String
570   * @param quote      of type String
571   */
572  @ConstructorProperties({"compressor", "hasHeader", "delimiter", "quote"})
573  public TextDelimited( Compressor compressor, boolean hasHeader, String delimiter, String quote )
574    {
575    this( Fields.ALL, compressor, hasHeader, delimiter, quote, (Class[]) null );
576    }
577
578  /**
579   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
580   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
581   * <p>
582   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
583   * with a {@link cascading.pipe.Checkpoint} Tap.
584   *
585   * @param compressor      of type Compressor, see {@link Compressors}
586   * @param hasHeader       of type boolean, passed twice to the delegate, presumably as {@code skipHeader} and {@code writeHeader}
587   * @param delimitedParser of type DelimitedParser
588   */
589  @ConstructorProperties({"compressor", "hasHeader", "delimitedParser"})
590  public TextDelimited( Compressor compressor, boolean hasHeader, DelimitedParser delimitedParser )
591    {
592    this( Fields.ALL, compressor, hasHeader, hasHeader, delimitedParser );
593    }
594
595  /**
596   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
597   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
598   * <p>
599   * Use this constructor if the source and sink fields will be resolved during planning, for example, when used
600   * with a {@link cascading.pipe.Checkpoint} Tap.
601   * <p>
602   * This constructor passes {@code true} for both the {@code skipHeader} and {@code writeHeader} values.
603   *
604   * @param compressor      of type Compressor, see {@link Compressors}
605   * @param delimitedParser of type DelimitedParser
606   */
607  @ConstructorProperties({"compressor", "delimitedParser"})
608  public TextDelimited( Compressor compressor, DelimitedParser delimitedParser )
609    {
610    this( Fields.ALL, compressor, true, true, delimitedParser );
611    }
612
613  /**
614   * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter, no quote string and no field types.
615   *
616   * @param fields     of type Fields
617   * @param compressor of type Compressor, see {@link Compressors}
618   */
619  @ConstructorProperties({"fields", "compressor"})
620  public TextDelimited( Fields fields, Compressor compressor )
621    {
622    this( fields, compressor, "\t", null, null );
623    }
624
625  /**
626   * Constructor TextDelimited creates a new TextDelimited instance using the given delimiter, no quote string and no field types.
627   *
628   * @param fields     of type Fields
629   * @param compressor of type Compressor, see {@link Compressors}
630   * @param delimiter  of type String
631   */
632  @ConstructorProperties({"fields", "compressor", "delimiter"})
633  public TextDelimited( Fields fields, Compressor compressor, String delimiter )
634    {
635    this( fields, compressor, delimiter, null, null );
636    }
637
638  /**
639   * Constructor TextDelimited creates a new TextDelimited instance with no quote string and no field types.
640   *
641   * @param fields     of type Fields
642   * @param compressor of type Compressor, see {@link Compressors}
643   * @param hasHeader  of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
644   * @param delimiter  of type String
645   */
646  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter"})
647  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter )
648    {
649    this( fields, compressor, hasHeader, hasHeader, delimiter, null, null );
650    }
651
652  /**
653   * Constructor TextDelimited creates a new TextDelimited instance with no quote string and no field types.
654   * @param fields      of type Fields
655   * @param compressor  of type Compressor, see {@link Compressors}
656   * @param skipHeader  of type boolean
657   * @param writeHeader of type boolean
658   * @param delimiter   of type String
659   */
660  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter"})
661  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter )
662    {
663    this( fields, compressor, skipHeader, writeHeader, delimiter, null, null );
664    }
665
666  /**
667   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
668   *
669   * @param fields     of type Fields
670   * @param compressor of type Compressor, see {@link Compressors}
671   * @param delimiter  of type String
672   * @param types      of type Class[]
673   */
674  @ConstructorProperties({"fields", "compressor", "delimiter", "types"})
675  public TextDelimited( Fields fields, Compressor compressor, String delimiter, Class[] types )
676    {
677    this( fields, compressor, delimiter, null, types );
678    }
679
680  /**
681   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
682   *
683   * @param fields     of type Fields
684   * @param compressor of type Compressor, see {@link Compressors}
685   * @param hasHeader  of type boolean, applied to both {@code skipHeader} and {@code writeHeader}
686   * @param delimiter  of type String
687   * @param types      of type Class[]
688   */
689  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "types"})
690  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, Class[] types )
691    {
692    this( fields, compressor, hasHeader, hasHeader, delimiter, null, types );
693    }
694
695  /**
696   * Constructor TextDelimited creates a new TextDelimited instance with no quote string.
697   *
698   * @param fields      of type Fields
699   * @param compressor  of type Compressor, see {@link Compressors}
700   * @param skipHeader  of type boolean
701   * @param writeHeader of type boolean
702   * @param delimiter   of type String
703   * @param types       of type Class[]
704   */
705  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "types"})
706  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
707    {
708    this( fields, compressor, skipHeader, writeHeader, delimiter, null, types );
709    }
710
711  /**
712   * Constructor TextDelimited creates a new TextDelimited instance with {@code hasHeader} set to {@code false}.
713   *
714   * @param fields     of type Fields
715   * @param compressor of type Compressor, see {@link Compressors}
716   * @param delimiter  of type String
717   * @param quote      of type String
718   * @param types      of type Class[]
719   */
720  @ConstructorProperties({"fields", "compressor", "delimiter", "quote", "types"})
721  public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote, Class[] types )
722    {
723    this( fields, compressor, false, delimiter, quote, types );
724    }
725
726  /**
727   * Constructor TextDelimited creates a new TextDelimited instance, passing a trailing {@code true} (presumably {@code safe}) to its delegate.
728   *
729   * @param fields     of type Fields
730   * @param compressor of type Compressor, see {@link Compressors}
731   * @param hasHeader  of type boolean, passed twice to the delegate, presumably as {@code skipHeader} and {@code writeHeader}
732   * @param delimiter  of type String
733   * @param quote      of type String
734   * @param types      of type Class[]
735   */
736  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types"})
737  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types )
738    {
739    this( fields, compressor, hasHeader, hasHeader, delimiter, quote, types, true );
740    }
741
742  /**
743   * Constructor TextDelimited creates a new TextDelimited instance, passing a trailing {@code true} (presumably {@code safe}) to its delegate.
744   *
745   * @param fields      of type Fields
746   * @param compressor  of type Compressor, see {@link Compressors}
747   * @param skipHeader  of type boolean
748   * @param writeHeader of type boolean
749   * @param delimiter   of type String
750   * @param quote       of type String
751   * @param types       of type Class[]
752   */
753  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
754  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
755    {
756    this( fields, compressor, skipHeader, writeHeader, delimiter, quote, types, true );
757    }
758
759  /**
760   * Constructor TextDelimited creates a new TextDelimited instance.
761   *
762   * @param fields     of type Fields
763   * @param compressor of type Compressor, see {@link Compressors}
764   * @param delimiter  of type String
765   * @param quote      of type String
766   * @param types      of type Class[]
767   * @param safe       of type boolean
768   */
769  @ConstructorProperties({"fields", "compressor", "delimiter", "quote", "types", "safe"})
770  public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote, Class[] types, boolean safe )
771    {
772    this( fields, compressor, false, delimiter, quote, types, safe );
773    }
774
775  /**
776   * Constructor TextDelimited creates a new TextDelimited instance.
777   *
778   * @param fields     of type Fields
779   * @param compressor of type Compressor, see {@link Compressors}
780   * @param hasHeader  of type boolean
781   * @param delimiter  of type String
782   * @param quote      of type String
783   * @param types      of type Class[]
784   * @param safe       of type boolean
785   */
786  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types", "safe"})
787  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
788    {
789    this( fields, compressor, hasHeader, hasHeader, delimiter, true, quote, types, safe );
790    }
791
792  /**
793   * Constructor TextDelimited creates a new TextDelimited instance.
794   *
795   * @param fields      of type Fields
796   * @param compressor  of type Compressor, see {@link Compressors}
797   * @param hasHeader   of type boolean
798   * @param delimiter   of type String
799   * @param quote       of type String
800   * @param types       of type Class[]
801   * @param safe        of type boolean
802   * @param charsetName of type String
803   */
804  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"})
805  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName )
806    {
807    this( fields, compressor, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName );
808    }
809
810  /**
811   * Constructor TextDelimited creates a new TextDelimited instance.
812   *
813   * @param fields      of type Fields
814   * @param compressor  of type Compressor, see {@link Compressors}
815   * @param skipHeader  of type boolean
816   * @param writeHeader of type boolean
817   * @param delimiter   of type String
818   * @param quote       of type String
819   * @param types       of type Class[]
820   * @param safe        of type boolean
821   */
822  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"})
823  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
824    {
825    this( fields, compressor, skipHeader, writeHeader, delimiter, true, quote, types, safe );
826    }
827
828  /**
829   * Constructor TextDelimited creates a new TextDelimited instance.
830   *
831   * @param fields     of type Fields
832   * @param compressor of type Compressor, see {@link Compressors}
833   * @param delimiter  of type String
834   * @param quote      of type String
835   */
836  @ConstructorProperties({"fields", "compressor", "delimiter", "quote"})
837  public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote )
838    {
839    this( fields, compressor, false, delimiter, quote, null, true );
840    }
841
842  /**
843   * Constructor TextDelimited creates a new TextDelimited instance.
844   *
845   * @param fields     of type Fields
846   * @param compressor of type Compressor, see {@link Compressors}
847   * @param hasHeader  of type boolean
848   * @param delimiter  of type String
849   * @param quote      of type String
850   */
851  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote"})
852  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote )
853    {
854    this( fields, compressor, hasHeader, delimiter, quote, null, true );
855    }
856
857  /**
858   * Constructor TextDelimited creates a new TextDelimited instance.
859   *
860   * @param fields      of type Fields
861   * @param compressor  of type Compressor, see {@link Compressors}
862   * @param hasHeader   of type boolean
863   * @param delimiter   of type String
864   * @param quote       of type String
865   * @param charsetName of type String
866   */
867  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "charsetName"})
868  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, String charsetName )
869    {
870    this( fields, compressor, hasHeader, delimiter, quote, null, true, charsetName );
871    }
872
873  /**
874   * Constructor TextDelimited creates a new TextDelimited instance.
875   *
876   * @param fields      of type Fields
877   * @param compressor  of type Compressor, see {@link Compressors}
878   * @param skipHeader  of type boolean
879   * @param writeHeader of type boolean
880   * @param delimiter   of type String
881   * @param strict      of type boolean
882   * @param quote       of type String
883   * @param types       of type Class[]
884   * @param safe        of type boolean
885   */
886  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types",
887                          "safe"})
888  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe )
889    {
890    this( fields, compressor, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET );
891    }
892
893  /**
894   * Constructor TextDelimited creates a new TextDelimited instance.
895   *
896   * @param fields      of type Fields
897   * @param compressor  of type Compressor, see {@link Compressors}
898   * @param skipHeader  of type boolean
899   * @param writeHeader of type boolean
900   * @param delimiter   of type String
901   * @param strict      of type boolean
902   * @param quote       of type String
903   * @param types       of type Class[]
904   * @param safe        of type boolean
905   * @param charsetName of type String
906   */
907  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types",
908                          "safe", "charsetName"})
909  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName )
910    {
911    this( fields, compressor, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) );
912    }
913
914  /**
915   * Constructor TextDelimited creates a new TextDelimited instance.
916   *
917   * @param fields          of type Fields
918   * @param compressor      of type Compressor, see {@link Compressors}
919   * @param writeHeader     of type boolean
920   * @param delimitedParser of type DelimitedParser
921   */
922  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimitedParser"})
923  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
924    {
925    this( fields, compressor, skipHeader, writeHeader, null, delimitedParser );
926    }
927
928  /**
929   * Constructor TextDelimited creates a new TextDelimited instance.
930   *
931   * @param fields          of type Fields
932   * @param compressor      of type Compressor, see {@link Compressors}
933   * @param hasHeader       of type boolean
934   * @param delimitedParser of type DelimitedParser
935   */
936  @ConstructorProperties({"fields", "compressor", "hasHeader", "delimitedParser"})
937  public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, DelimitedParser delimitedParser )
938    {
939    this( fields, compressor, hasHeader, hasHeader, null, delimitedParser );
940    }
941
942  /**
943   * Constructor TextDelimited creates a new TextDelimited instance.
944   *
945   * @param fields          of type Fields
946   * @param compressor      of type Compressor, see {@link Compressors}
947   * @param compressor      of type Compressor, see {@link Compressors}
948   * @param writeHeader     of type boolean
949   * @param charsetName     of type String
950   * @param delimitedParser of type DelimitedParser
951   */
952  @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "charsetName", "delimitedParser"})
953  public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser )
954    {
955    super( fields, fields, compressor );
956
957    this.delimitedParser = delimitedParser;
958
959    // normalizes ALL and UNKNOWN
960    // calls reset on delimitedParser
961    setSourceFields( fields );
962    setSinkFields( fields );
963
964    this.skipHeader = skipHeader;
965    this.writeHeader = writeHeader;
966
967    if( charsetName != null )
968      this.charsetName = charsetName;
969
970    // throws an exception if not found
971    Charset.forName( this.charsetName );
972    }
973
974  @Property(name = "charset", visibility = Visibility.PUBLIC)
975  @PropertyDescription("character set used.")
976  public String getCharsetName()
977    {
978    return charsetName;
979    }
980
981  /**
982   * Method getDelimiter returns the delimiter used to parse fields from the current line of text.
983   *
984   * @return a String
985   */
986  @Property(name = "delimiter", visibility = Visibility.PUBLIC)
987  @PropertyDescription("The delimiter used to separate fields.")
988  public String getDelimiter()
989    {
990    return delimitedParser.getDelimiter();
991    }
992
993  /**
994   * Method getQuote returns the quote string, if any, used to encapsulate each field in a line to delimited text.
995   *
996   * @return a String
997   */
998  @Property(name = "quote", visibility = Visibility.PUBLIC)
999  @PropertyDescription("The string used for quoting.")
1000  public String getQuote()
1001    {
1002    return delimitedParser.getQuote();
1003    }
1004
1005  public LineNumberReader createInput( InputStream inputStream )
1006    {
1007    try
1008      {
1009      return new LineNumberReader( new InputStreamReader( inputStream, charsetName ) );
1010      }
1011    catch( UnsupportedEncodingException exception )
1012      {
1013      throw new TapException( exception );
1014      }
1015    }
1016
1017  public PrintWriter createOutput( OutputStream outputStream )
1018    {
1019    try
1020      {
1021      return new PrintWriter( new OutputStreamWriter( outputStream, charsetName ) );
1022      }
1023    catch( UnsupportedEncodingException exception )
1024      {
1025      throw new TapException( exception );
1026      }
1027    }
1028
1029  @Override
1030  public void setSinkFields( Fields sinkFields )
1031    {
1032    super.setSourceFields( sinkFields );
1033    super.setSinkFields( sinkFields );
1034
1035    if( delimitedParser != null )
1036      delimitedParser.reset( getSourceFields(), getSinkFields() );
1037    }
1038
1039  @Override
1040  public void setSourceFields( Fields sourceFields )
1041    {
1042    super.setSourceFields( sourceFields );
1043    super.setSinkFields( sourceFields );
1044
1045    if( delimitedParser != null )
1046      delimitedParser.reset( getSourceFields(), getSinkFields() );
1047    }
1048
1049  @Override
1050  public boolean isSymmetrical()
1051    {
1052    return super.isSymmetrical() && skipHeader == writeHeader;
1053    }
1054
1055  @Override
1056  public Fields retrieveSourceFields( FlowProcess<? extends Properties> process, Tap tap )
1057    {
1058    if( !skipHeader || !getSourceFields().isUnknown() )
1059      return getSourceFields();
1060
1061    // no need to open them all
1062    if( tap instanceof CompositeTap )
1063      tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next();
1064
1065    tap = new FileTap( new TextLine( new Fields( "line" ), charsetName ), tap.getIdentifier() );
1066
1067    setSourceFields( delimitedParser.parseFirstLine( process, tap ) );
1068
1069    return getSourceFields();
1070    }
1071
1072  @Override
1073  public void presentSourceFields( FlowProcess<? extends Properties> process, Tap tap, Fields fields )
1074    {
1075    // do nothing
1076    }
1077
1078  @Override
1079  public void presentSinkFields( FlowProcess<? extends Properties> flowProcess, Tap tap, Fields fields )
1080    {
1081    if( writeHeader )
1082      presentSinkFieldsInternal( fields );
1083    }
1084
1085  @Override
1086  public void sourceConfInit( FlowProcess<? extends Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf )
1087    {
1088    }
1089
1090  @Override
1091  public void sourcePrepare( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
1092    {
1093    sourceCall.setContext( createInput( sourceCall.getInput() ) );
1094
1095    sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() );
1096    }
1097
1098  @Override
1099  public void sourceRePrepare( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
1100    {
1101    sourceCall.setContext( createInput( sourceCall.getInput() ) );
1102    }
1103
1104  @Override
1105  public boolean source( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
1106    {
1107    String line = sourceCall.getContext().readLine();
1108
1109    if( line == null )
1110      return false;
1111
1112    if( skipHeader && sourceCall.getContext().getLineNumber() == 1 ) // todo: optimize this away
1113      line = sourceCall.getContext().readLine();
1114
1115    if( line == null )
1116      return false;
1117
1118    Object[] split = delimitedParser.parseLine( line );
1119
1120    // assumption it is better to re-use than to construct new
1121    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
1122
1123    TupleViews.reset( tuple, split );
1124
1125    return true;
1126    }
1127
1128  @Override
1129  public void sourceCleanup( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException
1130    {
1131    sourceCall.setContext( null );
1132    }
1133
1134  @Override
1135  public void sinkConfInit( FlowProcess<? extends Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf )
1136    {
1137    }
1138
1139  @Override
1140  public void sinkPrepare( FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall )
1141    {
1142    OutputStream originalOutput = sinkCall.getOutput();
1143    sinkCall.setContext( createOutput( originalOutput ) );
1144
1145    if( writeHeader && !isAppendingFile( sinkCall, originalOutput ) )
1146      {
1147      Fields fields = sinkCall.getOutgoingEntry().getFields();
1148      delimitedParser.joinFirstLine( fields, sinkCall.getContext() );
1149
1150      sinkCall.getContext().println();
1151      }
1152    }
1153
1154  protected boolean isAppendingFile( SinkCall<PrintWriter, OutputStream> sinkCall, OutputStream originalOutput )
1155    {
1156    try
1157      {
1158      return sinkCall.getTap().getSinkMode() == SinkMode.UPDATE &&
1159        originalOutput instanceof FileOutputStream &&
1160        ( (FileOutputStream) originalOutput ).getChannel().position() != 0;
1161      }
1162    catch( IOException exception )
1163      {
1164      // the error will be thrown immediately downstream
1165      return false;
1166      }
1167    }
1168
1169  @Override
1170  public void sink( FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException
1171    {
1172    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
1173
1174    Iterable<String> strings = tupleEntry.asIterableOf( String.class );
1175
1176    delimitedParser.joinLine( strings, sinkCall.getContext() );
1177
1178    sinkCall.getContext().println();
1179    }
1180
1181  @Override
1182  public void sinkCleanup( FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall )
1183    {
1184    sinkCall.getContext().flush();
1185    sinkCall.setContext( null );
1186    }
1187
1188  @Override
1189  public String getExtension()
1190    {
1191    switch( getDelimiter().trim() )
1192      {
1193      case "\t":
1194        return "tsv";
1195
1196      case ",":
1197        return "csv";
1198      }
1199
1200    return "txt";
1201    }
1202  }