001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.scheme.hadoop;
023
024import java.beans.ConstructorProperties;
025import java.io.IOException;
026import java.nio.charset.Charset;
027
028import cascading.flow.FlowProcess;
029import cascading.management.annotation.Property;
030import cascading.management.annotation.PropertyDescription;
031import cascading.management.annotation.Visibility;
032import cascading.scheme.SinkCall;
033import cascading.scheme.SourceCall;
034import cascading.scheme.util.DelimitedParser;
035import cascading.tap.CompositeTap;
036import cascading.tap.Tap;
037import cascading.tap.TapException;
038import cascading.tap.hadoop.Hfs;
039import cascading.tuple.Fields;
040import cascading.tuple.Tuple;
041import cascading.tuple.TupleEntry;
042import cascading.tuple.util.TupleViews;
043import org.apache.hadoop.conf.Configuration;
044import org.apache.hadoop.io.LongWritable;
045import org.apache.hadoop.io.Text;
046import org.apache.hadoop.mapred.OutputCollector;
047import org.apache.hadoop.mapred.RecordReader;
048
049/**
050 * Class TextDelimited is a sub-class of {@link TextLine}. It provides direct support for delimited text files, like
051 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
052 * <p>
053 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
054 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
055 * be skipped.
056 * <p>
057 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
058 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
059 * file and used during planning. The header will be parsed with the same rules as the body of the file.
060 * <p>
061 * By default headers are not skipped.
062 * <p>
063 * TextDelimited may also be used to write a "header" in a file. The fields names for the header are taken directly
064 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
065 * resolved field names will be used, if any.
066 * <p>
067 * By default headers are not written.
068 * <p>
069 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
070 * be set to {@code true}.
071 * <p>
072 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
073 * <p>
074 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
075 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
076 * for the missing fields.
077 * <p>
078 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
079 * If safe is {@code false}, a {@link TapException} will be thrown.
080 * <p>
081 * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is
082 * COMMA delimited but may have COMMA's in a value, the whole value should be surrounded by the quote string, typically
083 * double quotes ({@literal "}).
084 * <p>
085 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
086 * <p>
087 * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically
088 * default to strict == false as the number of fields parsed are arbitrary or unknown. A type array may not be given
089 * either, so all values will be returned as Strings.
090 * <p>
091 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
092 * argument.
093 * <p>
094 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
095 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
096 * <p>
097 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
098 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
099 * <p>
100 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
101 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
102 * <p>
103 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
104 * are responsible for cleansing large data-sets when faced with the problem.
105 * <p>
106 * DelimitedParser may be sub-classed and extended if necessary.
107 *
108 * @see TextLine
109 */
110public class TextDelimited extends TextLine
111  {
112  public static final String DEFAULT_CHARSET = "UTF-8"; // name of the charset the class documentation gives as the default
113
114  /** Field delimitedParser, the {@link DelimitedParser} held by this scheme, visible to sub-classes */
115  protected final DelimitedParser delimitedParser;
116  /** Field skipHeader, when true the header line is skipped (see the class documentation) */
117  private boolean skipHeader;
118  private final boolean writeHeader; // when true a header line is written (see the class documentation)
119
120  /**
121   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
122   * {@link Fields#ALL} and using TAB as the delimiter, passing {@code null} for sinkCompression, quote and types.
123   * <p>
124   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
125   * with a {@link cascading.pipe.Checkpoint} Tap.
126   */
127  public TextDelimited()
128    {
129    this( Fields.ALL, null, "\t", null, null );
130    }
131
132  /**
133   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
134   * {@link Fields#ALL} and using the given delimiter, passing {@code null} for sinkCompression, quote and types.
135   * <p>
136   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
137   * with a {@link cascading.pipe.Checkpoint} Tap.
138   *
139   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
140   * @param delimiter of type String
141   */
142  @ConstructorProperties({"hasHeader", "delimiter"})
143  public TextDelimited( boolean hasHeader, String delimiter )
144    {
145    this( Fields.ALL, null, hasHeader, delimiter, null, (Class[]) null );
146    }
147
148  /**
149   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
150   * {@link Fields#ALL} and using the given delimiter and quote, passing {@code null} for sinkCompression and types.
151   * <p>
152   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
153   * with a {@link cascading.pipe.Checkpoint} Tap.
154   *
155   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
156   * @param delimiter of type String
157   * @param quote     of type String
158   */
159  @ConstructorProperties({"hasHeader", "delimiter", "quote"})
160  public TextDelimited( boolean hasHeader, String delimiter, String quote )
161    {
162    this( Fields.ALL, null, hasHeader, delimiter, quote, (Class[]) null );
163    }
164
165  /**
166   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
167   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
168   * <p>
169   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
170   * with a {@link cascading.pipe.Checkpoint} Tap.
171   *
172   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
173   * @param delimitedParser of type DelimitedParser
174   */
175  @ConstructorProperties({"hasHeader", "delimitedParser"})
176  public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser )
177    {
178    this( Fields.ALL, null, hasHeader, hasHeader, delimitedParser );
179    }
180
181  /**
182   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
183   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
184   * <p>
185   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
186   * with a {@link cascading.pipe.Checkpoint} Tap.
187   * <p>
188   * This constructor passes {@code true} for both {@code skipHeader} and {@code writeHeader}.
189   *
190   * @param delimitedParser of type DelimitedParser
191   */
192  @ConstructorProperties({"delimitedParser"})
193  public TextDelimited( DelimitedParser delimitedParser )
194    {
195    this( Fields.ALL, null, true, true, delimitedParser );
196    }
197
198  /**
199   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
200   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
201   * <p>
202   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
203   * with a {@link cascading.pipe.Checkpoint} Tap.
204   *
205   * @param sinkCompression of type Compress
206   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
207   * @param delimitedParser of type DelimitedParser
208   */
209  @ConstructorProperties({"sinkCompression", "hasHeader", "delimitedParser"})
210  public TextDelimited( Compress sinkCompression, boolean hasHeader, DelimitedParser delimitedParser )
211    {
212    this( Fields.ALL, sinkCompression, hasHeader, hasHeader, delimitedParser );
213    }
214
215  /**
216   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
217   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
218   * <p>
219   * Use this constructor if the source and sink fields will be resolved during planning, e.g. with a {@link cascading.pipe.Checkpoint} Tap.
220   * <p>
221   * This constructor passes {@code true} for both {@code skipHeader} and {@code writeHeader}.
222   *
223   * @param sinkCompression of type Compress
224   * @param delimitedParser of type DelimitedParser
225   */
226  @ConstructorProperties({"sinkCompression", "delimitedParser"})
227  public TextDelimited( Compress sinkCompression, DelimitedParser delimitedParser )
228    {
229    this( Fields.ALL, sinkCompression, true, true, delimitedParser );
230    }
231
232  /**
233   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
234   * {@link Fields#ALL} and using the given delimiter and quote, passing {@code null} for types.
235   * <p>
236   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
237   * with a {@link cascading.pipe.Checkpoint} Tap.
238   *
239   * @param sinkCompression of type Compress
240   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
241   * @param delimiter       of type String
242   * @param quote           of type String
243   */
244  @ConstructorProperties({"sinkCompression", "hasHeader", "delimiter", "quote"})
245  public TextDelimited( Compress sinkCompression, boolean hasHeader, String delimiter, String quote )
246    {
247    this( Fields.ALL, sinkCompression, hasHeader, delimiter, quote, (Class[]) null );
248    }
249
250  /**
251   * Constructor TextDelimited creates a new TextDelimited instance with TAB as the delimiter, passing {@code null} for sinkCompression, quote and types.
252   *
253   * @param fields of type Fields
254   */
255  @ConstructorProperties({"fields"})
256  public TextDelimited( Fields fields )
257    {
258    this( fields, null, "\t", null, null );
259    }
260
261  /**
262   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression, quote and types.
263   *
264   * @param fields    of type Fields
265   * @param delimiter of type String
266   */
267  @ConstructorProperties({"fields", "delimiter"})
268  public TextDelimited( Fields fields, String delimiter )
269    {
270    this( fields, null, delimiter, null, null );
271    }
272
273  /**
274   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression, quote and types.
275   *
276   * @param fields    of type Fields
277   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
278   * @param delimiter of type String
279   */
280  @ConstructorProperties({"fields", "hasHeader", "delimiter"})
281  public TextDelimited( Fields fields, boolean hasHeader, String delimiter )
282    {
283    this( fields, null, hasHeader, hasHeader, delimiter, null, null );
284    }
285
286  /**
287   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression, quote and types.
288   *
289   * @param fields      of type Fields
290   * @param skipHeader  of type boolean
291   * @param writeHeader of type boolean
292   * @param delimiter   of type String
293   */
294  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"})
295  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter )
296    {
297    this( fields, null, skipHeader, writeHeader, delimiter, null, null );
298    }
299
300  /**
301   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression and quote.
302   *
303   * @param fields    of type Fields
304   * @param delimiter of type String
305   * @param types     of type Class[]
306   */
307  @ConstructorProperties({"fields", "delimiter", "types"})
308  public TextDelimited( Fields fields, String delimiter, Class[] types )
309    {
310    this( fields, null, delimiter, null, types );
311    }
312
313  /**
314   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression and quote.
315   *
316   * @param fields    of type Fields
317   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
318   * @param delimiter of type String
319   * @param types     of type Class[]
320   */
321  @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"})
322  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types )
323    {
324    this( fields, null, hasHeader, hasHeader, delimiter, null, types );
325    }
326
327  /**
328   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression and quote.
329   *
330   * @param fields      of type Fields
331   * @param skipHeader  of type boolean
332   * @param writeHeader of type boolean
333   * @param delimiter   of type String
334   * @param types       of type Class[]
335   */
336  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"})
337  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
338    {
339    this( fields, null, skipHeader, writeHeader, delimiter, null, types );
340    }
341
342  /**
343   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
344   *
345   * @param fields    of type Fields
346   * @param delimiter of type String
347   * @param quote     of type String
348   * @param types     of type Class[]
349   */
350  @ConstructorProperties({"fields", "delimiter", "quote", "types"})
351  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types )
352    {
353    this( fields, null, delimiter, quote, types );
354    }
355
356  /**
357   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
358   *
359   * @param fields    of type Fields
360   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
361   * @param delimiter of type String
362   * @param quote     of type String
363   * @param types     of type Class[]
364   */
365  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"})
366  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types )
367    {
368    this( fields, null, hasHeader, hasHeader, delimiter, quote, types );
369    }
370
371  /**
372   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
373   *
374   * @param fields      of type Fields
375   * @param skipHeader  of type boolean
376   * @param writeHeader of type boolean
377   * @param delimiter   of type String
378   * @param quote       of type String
379   * @param types       of type Class[]
380   */
381  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
382  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
383    {
384    this( fields, null, skipHeader, writeHeader, delimiter, quote, types );
385    }
386
387  /**
388   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
389   *
390   * @param fields    of type Fields
391   * @param delimiter of type String
392   * @param quote     of type String
393   * @param types     of type Class[]
394   * @param safe      of type boolean
395   */
396  @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"})
397  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe )
398    {
399    this( fields, null, delimiter, quote, types, safe );
400    }
401
402  /**
403   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
404   *
405   * @param fields    of type Fields
406   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
407   * @param delimiter of type String
408   * @param quote     of type String
409   * @param types     of type Class[]
410   * @param safe      of type boolean
411   */
412  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"})
413  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
414    {
415    this( fields, null, hasHeader, hasHeader, delimiter, quote, types, safe );
416    }
417
418  /**
419   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression; the literal {@code true} is presumably the strict flag.
420   *
421   * @param fields      of type Fields
422   * @param hasHeader   of type boolean, used for both skipHeader and writeHeader
423   * @param delimiter   of type String
424   * @param quote       of type String
425   * @param types       of type Class[]
426   * @param safe        of type boolean
427   * @param charsetName of type String
428   */
429  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"})
430  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName )
431    {
432    this( fields, null, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName );
433    }
434
435  /**
436   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
437   *
438   * @param fields      of type Fields
439   * @param skipHeader  of type boolean
440   * @param writeHeader of type boolean
441   * @param delimiter   of type String
442   * @param quote       of type String
443   * @param types       of type Class[]
444   * @param safe        of type boolean
445   */
446  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"})
447  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
448    {
449    this( fields, null, skipHeader, writeHeader, delimiter, quote, types, safe );
450    }
451
452  /**
453   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote and types.
454   *
455   * @param fields          of type Fields
456   * @param sinkCompression of type Compress
457   * @param delimiter       of type String
458   */
459  @ConstructorProperties({"fields", "sinkCompression", "delimiter"})
460  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter )
461    {
462    this( fields, sinkCompression, delimiter, null, null );
463    }
464
465  /**
466   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote and types.
467   *
468   * @param fields          of type Fields
469   * @param sinkCompression of type Compress
470   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
471   * @param delimiter       of type String
472   */
473  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter"})
474  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter )
475    {
476    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, null );
477    }
478
479  /**
480   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote and types.
481   *
482   * @param fields          of type Fields
483   * @param sinkCompression of type Compress
484   * @param skipHeader      of type boolean
485   * @param writeHeader     of type boolean
486   * @param delimiter       of type String
487   */
488  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter"})
489  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter )
490    {
491    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, null );
492    }
493
494  /**
495   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
496   *
497   * @param fields          of type Fields
498   * @param sinkCompression of type Compress
499   * @param delimiter       of type String
500   * @param types           of type Class[]
501   */
502  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types"})
503  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types )
504    {
505    this( fields, sinkCompression, delimiter, null, types );
506    }
507
508  /**
509   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
510   *
511   * @param fields          of type Fields
512   * @param sinkCompression of type Compress
513   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
514   * @param delimiter       of type String
515   * @param types           of type Class[]
516   */
517  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types"})
518  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types )
519    {
520    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types );
521    }
522
523  /**
524   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
525   *
526   * @param fields          of type Fields
527   * @param sinkCompression of type Compress
528   * @param skipHeader      of type boolean
529   * @param writeHeader     of type boolean
530   * @param delimiter       of type String
531   * @param types           of type Class[]
532   */
533  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types"})
534  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
535    {
536    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types );
537    }
538
539  /**
540   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
541   *
542   * @param fields          of type Fields
543   * @param sinkCompression of type Compress
544   * @param delimiter       of type String
545   * @param types           of type Class[]
546   * @param safe            of type boolean
547   */
548  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types", "safe"})
549  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types, boolean safe )
550    {
551    this( fields, sinkCompression, delimiter, null, types, safe );
552    }
553
554  /**
555   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
556   *
557   * @param fields          of type Fields
558   * @param sinkCompression of type Compress
559   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
560   * @param delimiter       of type String
561   * @param types           of type Class[]
562   * @param safe            of type boolean
563   */
564  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe"})
565  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe )
566    {
567    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types, safe );
568    }
569
570  /**
571   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote; the literal {@code true} is presumably the strict flag.
572   *
573   * @param fields          of type Fields
574   * @param sinkCompression of type Compress
575   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
576   * @param delimiter       of type String
577   * @param types           of type Class[]
578   * @param safe            of type boolean
579   * @param charsetName     of type String
580   */
581  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe", "charsetName"})
582  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe, String charsetName )
583    {
584    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, null, types, safe, charsetName );
585    }
586
587  /**
588   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for quote.
589   *
590   * @param fields          of type Fields
591   * @param sinkCompression of type Compress
592   * @param skipHeader      of type boolean
593   * @param writeHeader     of type boolean
594   * @param delimiter       of type String
595   * @param types           of type Class[]
596   * @param safe            of type boolean
597   */
598  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types", "safe"})
599  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types, boolean safe )
600    {
601    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types, safe );
602    }
603
604  /**
605   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
606   *
607   * @param fields    of type Fields
608   * @param delimiter of type String
609   * @param quote     of type String
610   */
611  @ConstructorProperties({"fields", "delimiter", "quote"})
612  public TextDelimited( Fields fields, String delimiter, String quote )
613    {
614    this( fields, null, delimiter, quote );
615    }
616
617  /**
618   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
619   *
620   * @param fields    of type Fields
621   * @param hasHeader of type boolean, used for both skipHeader and writeHeader
622   * @param delimiter of type String
623   * @param quote     of type String
624   */
625  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"})
626  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote )
627    {
628    this( fields, null, hasHeader, hasHeader, delimiter, quote );
629    }
630
631  /**
632   * Constructor TextDelimited creates a new TextDelimited instance, passing {@code null} for sinkCompression.
633   *
634   * @param fields      of type Fields
635   * @param skipHeader  of type boolean
636   * @param writeHeader of type boolean
637   * @param delimiter   of type String
638   * @param quote       of type String
639   */
640  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote"})
641  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote )
642    {
643    this( fields, null, skipHeader, writeHeader, delimiter, quote );
644    }
645
646  /**
647   * Constructor TextDelimited creates a new TextDelimited instance with both header flags {@code false} and {@code null} types; the two {@code true} literals are presumably strict and safe.
648   *
649   * @param fields          of type Fields
650   * @param sinkCompression of type Compress
651   * @param delimiter       of type String
652   * @param quote           of type String
653   */
654  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote"})
655  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote )
656    {
657    this( fields, sinkCompression, false, false, delimiter, true, quote, null, true );
658    }
659
660  /**
661   * Constructor TextDelimited creates a new TextDelimited instance with {@code null} types; the two {@code true} literals are presumably strict and safe.
662   *
663   * @param fields          of type Fields
664   * @param sinkCompression of type Compress
665   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
666   * @param delimiter       of type String
667   * @param quote           of type String
668   */
669  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote"})
670  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote )
671    {
672    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true );
673    }
674
675  /**
676   * Constructor TextDelimited creates a new TextDelimited instance with {@code null} types; the two {@code true} literals are presumably strict and safe.
677   *
678   * @param fields          of type Fields
679   * @param sinkCompression of type Compress
680   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
681   * @param delimiter       of type String
682   * @param quote           of type String
683   * @param charsetName     of type String
684   */
685  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "charsetName"})
686  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, String charsetName )
687    {
688    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true, charsetName );
689    }
690
691  /**
692   * Constructor TextDelimited creates a new TextDelimited instance with {@code null} types; the two {@code true} literals are presumably strict and safe.
693   *
694   * @param fields          of type Fields
695   * @param sinkCompression of type Compress
696   * @param skipHeader      of type boolean
697   * @param writeHeader     of type boolean
698   * @param delimiter       of type String
699   * @param quote           of type String
700   */
701  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote"})
702  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote )
703    {
704    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, null, true );
705    }
706
707  /**
708   * Constructor TextDelimited creates a new TextDelimited instance with both header flags {@code false}; the two {@code true} literals are presumably strict and safe.
709   *
710   * @param fields          of type Fields
711   * @param sinkCompression of type Compress
712   * @param delimiter       of type String
713   * @param quote           of type String
714   * @param types           of type Class[]
715   */
716  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types"})
717  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types )
718    {
719    this( fields, sinkCompression, false, false, delimiter, true, quote, types, true );
720    }
721
722  /**
723   * Constructor TextDelimited creates a new TextDelimited instance; the two {@code true} literals it passes on are presumably strict and safe.
724   *
725   * @param fields          of type Fields
726   * @param sinkCompression of type Compress
727   * @param hasHeader       of type boolean, used for both skipHeader and writeHeader
728   * @param delimiter       of type String
729   * @param quote           of type String
730   * @param types           of type Class[]
731   */
732  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types"})
733  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types )
734    {
735    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, true );
736    }
737
738  /**
739   * Constructor TextDelimited creates a new TextDelimited instance.
740   *
741   * @param fields          of type Fields
742   * @param sinkCompression of type Compress
743   * @param skipHeader      of type boolean
744   * @param writeHeader     of type boolean
745   * @param delimiter       of type String
746   * @param quote           of type String
747   * @param types           of type Class[]
748   */
749  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
750  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
751    {
752    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, true );
753    }
754
755  /**
756   * Constructor TextDelimited creates a new TextDelimited instance.
757   *
758   * @param fields          of type Fields
759   * @param sinkCompression of type Compress
760   * @param delimiter       of type String
761   * @param quote           of type String
762   * @param types           of type Class[]
763   * @param safe            of type boolean
764   */
765  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types", "safe"})
766  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types, boolean safe )
767    {
768    this( fields, sinkCompression, false, false, delimiter, true, quote, types, safe );
769    }
770
771  /**
772   * Constructor TextDelimited creates a new TextDelimited instance.
773   *
774   * @param fields          of type Fields
775   * @param sinkCompression of type Compress
776   * @param hasHeader       of type boolean
777   * @param delimiter       of type String
778   * @param quote           of type String
779   * @param types           of type Class[]
780   * @param safe            of type boolean
781   */
782  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types", "safe"})
783  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
784    {
785    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, safe );
786    }
787
788  /**
789   * Constructor TextDelimited creates a new TextDelimited instance.
790   *
791   * @param fields          of type Fields
792   * @param sinkCompression of type Compress
793   * @param skipHeader      of type boolean
794   * @param writeHeader     of type boolean
795   * @param delimiter       of type String
796   * @param quote           of type String
797   * @param types           of type Class[]
798   * @param safe            of type boolean
799   */
800  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types",
801                          "safe"})
802  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
803    {
804    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, safe );
805    }
806
807  /**
808   * Constructor TextDelimited creates a new TextDelimited instance.
809   *
810   * @param fields          of type Fields
811   * @param sinkCompression of type Compress
812   * @param skipHeader      of type boolean
813   * @param delimiter       of type String
814   * @param strict          of type boolean
815   * @param quote           of type String
816   * @param types           of type Class[]
817   * @param safe            of type boolean
818   */
819  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote",
820                          "types", "safe"})
821  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe )
822    {
823    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET );
824    }
825
826  /**
827   * Constructor TextDelimited creates a new TextDelimited instance.
828   *
829   * @param fields          of type Fields
830   * @param sinkCompression of type Compress
831   * @param skipHeader      of type boolean
832   * @param delimiter       of type String
833   * @param strict          of type boolean
834   * @param quote           of type String
835   * @param types           of type Class[]
836   * @param safe            of type boolean
837   * @param charsetName     of type String
838   */
839  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote",
840                          "types", "safe", "charsetName"})
841  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName )
842    {
843    this( fields, sinkCompression, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) );
844    }
845
846  /**
847   * Constructor TextDelimited creates a new TextDelimited instance.
848   *
849   * @param fields          of type Fields
850   * @param writeHeader     of type boolean
851   * @param delimitedParser of type DelimitedParser
852   */
853  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"})
854  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
855    {
856    this( fields, null, skipHeader, writeHeader, null, delimitedParser );
857    }
858
859  /**
860   * Constructor TextDelimited creates a new TextDelimited instance.
861   *
862   * @param fields          of type Fields
863   * @param hasHeader       of type boolean
864   * @param delimitedParser of type DelimitedParser
865   */
866  @ConstructorProperties({"fields", "hasHeader", "delimitedParser"})
867  public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser )
868    {
869    this( fields, null, hasHeader, hasHeader, null, delimitedParser );
870    }
871
872  /**
873   * Constructor TextDelimited creates a new TextDelimited instance.
874   *
875   * @param fields          of type Fields
876   * @param writeHeader     of type boolean
877   * @param delimitedParser of type DelimitedParser
878   */
879  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimitedParser"})
880  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
881    {
882    this( fields, sinkCompression, skipHeader, writeHeader, null, delimitedParser );
883    }
884
885  /**
886   * Constructor TextDelimited creates a new TextDelimited instance.
887   *
888   * @param fields          of type Fields
889   * @param sinkCompression of type Compress
890   * @param skipHeader      of type boolean
891   * @param writeHeader     of type boolean
892   * @param charsetName     of type String
893   * @param delimitedParser of type DelimitedParser
894   */
895  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "charsetName", "delimitedParser"})
896  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser )
897    {
898    super( sinkCompression );
899
900    this.delimitedParser = delimitedParser;
901
902    // normalizes ALL and UNKNOWN
903    setSinkFields( fields );
904    setSourceFields( fields );
905
906    this.skipHeader = skipHeader;
907    this.writeHeader = writeHeader;
908
909    // throws an exception if not found
910    setCharsetName( charsetName );
911    }
912
913  /**
914   * Method getDelimiter returns the delimiter used to parse fields from the current line of text.
915   *
916   * @return a String
917   */
918  @Property(name = "delimiter", visibility = Visibility.PUBLIC)
919  @PropertyDescription("The delimiter used to separate fields.")
920  public String getDelimiter()
921    {
922    return delimitedParser.getDelimiter();
923    }
924
925  /**
926   * Method getQuote returns the quote string, if any, used to encapsulate each field in a line to delimited text.
927   *
928   * @return a String
929   */
930  @Property(name = "quote", visibility = Visibility.PUBLIC)
931  @PropertyDescription("The string used for quoting.")
932  public String getQuote()
933    {
934    return delimitedParser.getQuote();
935    }
936
937  @Override
938  public boolean isSymmetrical()
939    {
940    return super.isSymmetrical() && skipHeader == writeHeader;
941    }
942
943  @Override
944  public void setSinkFields( Fields sinkFields )
945    {
946    super.setSourceFields( sinkFields );
947    super.setSinkFields( sinkFields );
948
949    if( delimitedParser != null )
950      delimitedParser.reset( getSourceFields(), getSinkFields() );
951    }
952
953  @Override
954  public void setSourceFields( Fields sourceFields )
955    {
956    super.setSourceFields( sourceFields );
957    super.setSinkFields( sourceFields );
958
959    if( delimitedParser != null )
960      delimitedParser.reset( getSourceFields(), getSinkFields() );
961    }
962
963  @Override
964  public Fields retrieveSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap )
965    {
966    if( !skipHeader || !getSourceFields().isUnknown() )
967      return getSourceFields();
968
969    // no need to open them all
970    if( tap instanceof CompositeTap )
971      tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next();
972
973    // should revert to file:// (Lfs) if tap is Lfs
974    tap = new Hfs( new TextLine( new Fields( "line" ), charsetName ), tap.getFullIdentifier( flowProcess ) );
975
976    setSourceFields( delimitedParser.parseFirstLine( flowProcess, tap ) );
977
978    return getSourceFields();
979    }
980
981  @Override
982  public void presentSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
983    {
984    presentSourceFieldsInternal( fields );
985    }
986
987  @Override
988  public void presentSinkFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
989    {
990    presentSinkFieldsInternal( fields );
991    }
992
993  @Override
994  public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
995    {
996    super.sourcePrepare( flowProcess, sourceCall );
997
998    sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() );
999    }
1000
1001  @Override
1002  public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException
1003    {
1004    Object[] context = sourceCall.getContext();
1005
1006    if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) )
1007      return false;
1008
1009    if( skipHeader && ( (LongWritable) context[ 0 ] ).get() == 0 )
1010      {
1011      if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) )
1012        return false;
1013      }
1014
1015    // delegate coercion to delimitedParser for robustness
1016    Object[] split = delimitedParser.parseLine( makeEncodedString( context ) );
1017    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
1018
1019    TupleViews.reset( tuple, split );
1020
1021    return true;
1022    }
1023
1024  @Override
1025  public void sinkPrepare( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1026    {
1027    sinkCall.setContext( new Object[ 3 ] );
1028
1029    sinkCall.getContext()[ 0 ] = new Text();
1030    sinkCall.getContext()[ 1 ] = new StringBuilder( 4 * 1024 );
1031    sinkCall.getContext()[ 2 ] = Charset.forName( charsetName );
1032
1033    if( writeHeader )
1034      writeHeader( sinkCall );
1035    }
1036
1037  protected void writeHeader( SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1038    {
1039    Fields fields = sinkCall.getOutgoingEntry().getFields();
1040
1041    Text text = (Text) sinkCall.getContext()[ 0 ];
1042    StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ];
1043    Charset charset = (Charset) sinkCall.getContext()[ 2 ];
1044
1045    line = (StringBuilder) delimitedParser.joinFirstLine( fields, line );
1046
1047    text.set( line.toString().getBytes( charset ) );
1048
1049    sinkCall.getOutput().collect( null, text );
1050
1051    line.setLength( 0 );
1052    }
1053
1054  @Override
1055  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1056    {
1057    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
1058
1059    Text text = (Text) sinkCall.getContext()[ 0 ];
1060    StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ];
1061    Charset charset = (Charset) sinkCall.getContext()[ 2 ];
1062
1063    Iterable<String> strings = tupleEntry.asIterableOf( String.class );
1064
1065    line = (StringBuilder) delimitedParser.joinLine( strings, line );
1066
1067    text.set( line.toString().getBytes( charset ) );
1068
1069    sinkCall.getOutput().collect( null, text );
1070
1071    line.setLength( 0 );
1072    }
1073
1074  @Override
1075  public String getExtension()
1076    {
1077    switch( getDelimiter().trim() )
1078      {
1079      case "\t":
1080        return "tsv";
1081
1082      case ",":
1083        return "csv";
1084      }
1085
1086    return "txt";
1087    }
1088  }
1089