001/*
002 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.tap;
023
024import java.io.Closeable;
025import java.io.IOException;
026import java.io.Serializable;
027import java.io.UncheckedIOException;
028import java.util.Set;
029import java.util.Spliterator;
030import java.util.Spliterators;
031import java.util.stream.Stream;
032import java.util.stream.StreamSupport;
033
034import cascading.flow.Flow;
035import cascading.flow.FlowElement;
036import cascading.flow.FlowException;
037import cascading.flow.FlowProcess;
038import cascading.flow.planner.Scope;
039import cascading.flow.planner.ScopedElement;
040import cascading.management.annotation.Property;
041import cascading.management.annotation.PropertyDescription;
042import cascading.management.annotation.PropertySanitizer;
043import cascading.management.annotation.Visibility;
044import cascading.pipe.Pipe;
045import cascading.property.ConfigDef;
046import cascading.scheme.Scheme;
047import cascading.tuple.Fields;
048import cascading.tuple.FieldsResolverException;
049import cascading.tuple.Tuple;
050import cascading.tuple.TupleEntry;
051import cascading.tuple.TupleEntryCollector;
052import cascading.tuple.TupleEntryIterator;
053import cascading.util.TraceUtil;
054import cascading.util.Traceable;
055import cascading.util.Util;
056
057/**
058 * A Tap represents the physical data source or sink in a connected {@link cascading.flow.Flow}.
059 * <p>
060 * That is, a source Tap is the head end of a connected {@link Pipe} and {@link Tuple} stream, and
061 * a sink Tap is the tail end. Kinds of Tap types are used to manage files from a local disk,
062 * distributed disk, remote storage like Amazon S3, or via FTP. It simply abstracts
063 * out the complexity of connecting to these types of data sources.
064 * <p>
065 * A Tap takes a {@link Scheme} instance, which is used to identify the type of resource (text file, binary file, etc).
066 * A Tap is responsible for how the resource is reached.
067 * <p>
068 * By default when planning a Flow, Tap equality is a function of the {@link #getIdentifier()} and {@link #getScheme()}
069 * values. That is, two Tap instances are the same Tap instance if they sink/source the same resource and sink/source
070 * the same fields.
071 * <p>
072 * Some more advanced taps, like a database tap, may need to extend equality to include any filtering, like the
073 * {@code where} clause in a SQL statement so two taps reading from the same SQL table aren't considered equal.
074 * <p>
075 * Taps are also used to determine dependencies between two or more {@link Flow} instances when used with a
076 * {@link cascading.cascade.Cascade}. In that case the {@link #getFullIdentifier(Object)} value is used and the Scheme
077 * is ignored.
078 */
079public abstract class Tap<Config, Input, Output> implements ScopedElement, FlowElement, Serializable, Traceable
080  {
  /** Scheme of this Tap, assigned via {@link #setScheme(Scheme)} and returned by {@link #getScheme()}. */
  private Scheme<Config, Input, Output, ?, ?> scheme;

  /** Sink mode of this Tap, {@link SinkMode#KEEP} unless a constructor is handed another value. */
  SinkMode sinkMode = SinkMode.KEEP;

  /** Tap local properties, created on first call to {@link #getConfigDef()}. */
  private ConfigDef configDef;
  /** Node level properties, created on first call to {@link #getNodeConfigDef()}. */
  private ConfigDef nodeConfigDef;
  /** Step level properties, created on first call to {@link #getStepConfigDef()}. */
  private ConfigDef stepConfigDef;

  /** ID of this instance, assigned once at construction and returned by {@link #id(Tap)}. */
  private final String id = Util.createUniqueID(); // 3.0 planner relies on this being consistent
  /** Debug trace captured at construction, returned by {@link #getTrace()}. */
  private String trace = TraceUtil.captureDebugTrace( this ); // see TraceUtil.setTrace() to override
095
096  /**
097   * Convenience function to make an array of Tap instances.
098   *
099   * @param taps of type Tap
100   * @return Tap array
101   */
102  public static Tap[] taps( Tap... taps )
103    {
104    return taps;
105    }
106
107  /**
108   * Creates and returns a unique ID for the given Tap, this value is cached and may be used to uniquely identify
109   * the Tap instance in properties files etc.
110   * <p>
111   * This value is generally reproducible assuming the Tap identifier and the Scheme source and sink Fields remain consistent.
112   *
113   * @param tap of type Tap
114   * @return of type String
115   */
116  public static synchronized String id( Tap tap )
117    {
118    if( tap instanceof DecoratorTap )
119      return id( ( (DecoratorTap) tap ).getOriginal() );
120
121    return tap.id;
122    }
123
124  protected Tap()
125    {
126    }
127
128  protected Tap( Scheme<Config, Input, Output, ?, ?> scheme )
129    {
130    this.setScheme( scheme );
131    }
132
133  protected Tap( Scheme<Config, Input, Output, ?, ?> scheme, SinkMode sinkMode )
134    {
135    this.setScheme( scheme );
136    this.sinkMode = sinkMode;
137    }
138
139  protected void setScheme( Scheme<Config, Input, Output, ?, ?> scheme )
140    {
141    this.scheme = scheme;
142    }
143
144  /**
145   * Method getScheme returns the scheme of this Tap object.
146   *
147   * @return the scheme (type Scheme) of this Tap object.
148   */
149  public Scheme<Config, Input, Output, ?, ?> getScheme()
150    {
151    return scheme;
152    }
153
154  @Override
155  public String getTrace()
156    {
157    return trace;
158    }
159
160  /**
161   * Method flowInit allows this Tap instance to initialize itself in context of the given {@link cascading.flow.Flow} instance.
162   * This method is guaranteed to be called before the Flow is started and the
163   * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} event is fired.
164   * <p>
165   * This method will be called once per Flow, and before {@link #sourceConfInit(cascading.flow.FlowProcess, Object)} and
166   * {@link #sinkConfInit(cascading.flow.FlowProcess, Object)} methods.
167   *
168   * @param flow of type Flow
169   */
170  public void flowConfInit( Flow<Config> flow )
171    {
172
173    }
174
175  /**
176   * Method sourceConfInit initializes this instance as a source.
177   * <p>
178   * This method maybe called more than once if this Tap instance is used outside the scope of a {@link cascading.flow.Flow}
179   * instance or if it participates in multiple times in a given Flow or across different Flows in
180   * a {@link cascading.cascade.Cascade}.
181   * <p>
182   * In the context of a Flow, it will be called after
183   * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)}
184   * <p>
185   * Note that no resources or services should be modified by this method.
186   *
187   * @param flowProcess of type FlowProcess
188   * @param conf        of type Config
189   */
190  public void sourceConfInit( FlowProcess<? extends Config> flowProcess, Config conf )
191    {
192    getScheme().sourceConfInit( flowProcess, this, conf );
193    }
194
195  /**
196   * Method sinkConfInit initializes this instance as a sink.
197   * <p>
198   * This method maybe called more than once if this Tap instance is used outside the scope of a {@link cascading.flow.Flow}
199   * instance or if it participates in multiple times in a given Flow or across different Flows in
200   * a {@link cascading.cascade.Cascade}.
201   * <p>
202   * Note this method will be called in context of this Tap being used as a traditional 'sink' and as a 'trap'.
203   * <p>
204   * In the context of a Flow, it will be called after
205   * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)}
206   * <p>
207   * Note that no resources or services should be modified by this method. If this Tap instance returns true for
208   * {@link #isReplace()}, then {@link #deleteResource(Object)} will be called by the parent Flow.
209   *
210   * @param flowProcess of type FlowProcess
211   * @param conf        of type Config
212   */
213  public void sinkConfInit( FlowProcess<? extends Config> flowProcess, Config conf )
214    {
215    getScheme().sinkConfInit( flowProcess, this, conf );
216    }
217
  /**
   * Method getIdentifier returns a String naming the resource this Tap instance represents.
   * <p>
   * Often, if the tap accesses a filesystem, the identifier is nothing more than the path to the file or directory.
   * In other cases it may be a URL or URI representing a connection string or remote resource.
   * <p>
   * Any two Tap instances having the same value for the identifier are considered equal.
   *
   * @return the identifier (type String) of the underlying resource
   */
  @Property(name = "identifier", visibility = Visibility.PUBLIC)
  @PropertyDescription("The resource this instance represents")
  @PropertySanitizer("cascading.management.annotation.URISanitizer")
  public abstract String getIdentifier();
232
233  /**
234   * Method getSourceFields returns the sourceFields of this Tap object.
235   *
236   * @return the sourceFields (type Fields) of this Tap object.
237   */
238  public Fields getSourceFields()
239    {
240    return getScheme().getSourceFields();
241    }
242
243  /**
244   * Method getSinkFields returns the sinkFields of this Tap object.
245   *
246   * @return the sinkFields (type Fields) of this Tap object.
247   */
248  public Fields getSinkFields()
249    {
250    return getScheme().getSinkFields();
251    }
252
  /**
   * Method openForRead opens the resource represented by this Tap instance for reading.
   * <p>
   * The {@code input} value may be null. If so, sub-classes must inquire with the underlying {@link Scheme}
   * via {@link Scheme#sourceConfInit(cascading.flow.FlowProcess, Tap, Object)} to get the proper
   * input type and instantiate it before calling {@code super.openForRead()}.
   * <p>
   * Note the returned iterator will return the same instance of {@link cascading.tuple.TupleEntry} on every call,
   * thus a copy must be made of either the TupleEntry or the underlying {@code Tuple} instance if they are to be
   * stored in a Collection.
   *
   * @param flowProcess of type FlowProcess
   * @param input       of type Input, may be null
   * @return TupleEntryIterator
   * @throws java.io.IOException when the resource cannot be opened
   */
  public abstract TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess, Input input ) throws IOException;
270
271  /**
272   * Method openForRead opens the resource represented by this Tap instance for reading.
273   * <p>
274   * Note the returned iterator will return the same instance of {@link cascading.tuple.TupleEntry} on every call,
275   * thus a copy must be made of either the TupleEntry or the underlying {@code Tuple} instance if they are to be
276   * stored in a Collection.
277   *
278   * @param flowProcess of type FlowProcess
279   * @return TupleEntryIterator
280   * @throws java.io.IOException when the resource cannot be opened
281   */
282  public TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess ) throws IOException
283    {
284    return openForRead( flowProcess, null );
285    }
286
  /**
   * Method openForWrite opens the resource represented by this Tap instance for writing.
   * <p>
   * This method is used internally and does not honor the {@link SinkMode} setting. If SinkMode is
   * {@link SinkMode#REPLACE}, this call may fail. See {@link #openForWrite(cascading.flow.FlowProcess)}.
   * <p>
   * The {@code output} value may be null. If so, sub-classes must inquire with the underlying {@link Scheme}
   * via {@link Scheme#sinkConfInit(cascading.flow.FlowProcess, Tap, Object)} to get the proper
   * output type and instantiate it before calling {@code super.openForWrite()}.
   *
   * @param flowProcess of type FlowProcess
   * @param output      of type Output, may be null
   * @return TupleEntryCollector
   * @throws java.io.IOException when the resource cannot be opened
   */
  public abstract TupleEntryCollector openForWrite( FlowProcess<? extends Config> flowProcess, Output output ) throws IOException;
303
304  /**
305   * Method openForWrite opens the resource represented by this Tap instance for writing.
306   * <p>
307   * This method is for user application use and does honor the {@link SinkMode#REPLACE} settings. That is, if
308   * SinkMode is set to {@link SinkMode#REPLACE} the underlying resource will be deleted.
309   * <p>
310   * Note if {@link SinkMode#UPDATE} is set, the resource will not be deleted.
311   *
312   * @param flowProcess of type FlowProcess
313   * @return TupleEntryCollector
314   * @throws java.io.IOException when the resource cannot be opened
315   */
316  public TupleEntryCollector openForWrite( FlowProcess<? extends Config> flowProcess ) throws IOException
317    {
318    if( isReplace() )
319      deleteResource( flowProcess );
320
321    return openForWrite( flowProcess, null );
322    }
323
324  @Override
325  public Scope outgoingScopeFor( Set<Scope> incomingScopes )
326    {
327    // as a source Tap, we emit the scheme defined Fields
328    // as a sink Tap, we declare we emit the incoming Fields
329    // as a temp Tap, this method never gets called, but we emit what we consume
330    int count = 0;
331    for( Scope incomingScope : incomingScopes )
332      {
333      Fields incomingFields = incomingScope.getIncomingTapFields();
334
335      if( incomingFields != null )
336        {
337        try
338          {
339          incomingFields.select( getSinkFields() );
340          }
341        catch( FieldsResolverException exception )
342          {
343          throw new TapException( this, exception.getSourceFields(), exception.getSelectorFields(), exception );
344          }
345
346        count++;
347        }
348      }
349
350    if( count > 1 )
351      throw new FlowException( "Tap may not have more than one incoming Scope" );
352
353    // this allows the incoming to be passed through to the outgoing
354    Fields incomingFields = incomingScopes.size() == 0 ? null : incomingScopes.iterator().next().getIncomingTapFields();
355
356    if( incomingFields != null &&
357      ( isSource() && getSourceFields().equals( Fields.UNKNOWN ) ||
358        isSink() && getSinkFields().equals( Fields.ALL ) ) )
359      return new Scope( incomingFields );
360
361    if( count == 1 )
362      return new Scope( getSinkFields() );
363
364    return new Scope( getSourceFields() );
365    }
366
367  /**
368   * A hook for allowing a Scheme to lazily retrieve its source fields.
369   *
370   * @param flowProcess of type FlowProcess
371   * @return the found Fields
372   */
373  public Fields retrieveSourceFields( FlowProcess<? extends Config> flowProcess )
374    {
375    return getScheme().retrieveSourceFields( flowProcess, this );
376    }
377
378  public void presentSourceFields( FlowProcess<? extends Config> flowProcess, Fields fields )
379    {
380    getScheme().presentSourceFields( flowProcess, this, fields );
381    }
382
383  /**
384   * A hook for allowing a Scheme to lazily retrieve its sink fields.
385   *
386   * @param flowProcess of type FlowProcess
387   * @return the found Fields
388   */
389  public Fields retrieveSinkFields( FlowProcess<? extends Config> flowProcess )
390    {
391    return getScheme().retrieveSinkFields( flowProcess, this );
392    }
393
394  public void presentSinkFields( FlowProcess<? extends Config> flowProcess, Fields fields )
395    {
396    getScheme().presentSinkFields( flowProcess, this, fields );
397    }
398
399  @Override
400  public Fields resolveIncomingOperationArgumentFields( Scope incomingScope )
401    {
402    return incomingScope.getIncomingTapFields();
403    }
404
405  @Override
406  public Fields resolveIncomingOperationPassThroughFields( Scope incomingScope )
407    {
408    return incomingScope.getIncomingTapFields();
409    }
410
411  /**
412   * Method getFullIdentifier returns a fully qualified resource identifier.
413   *
414   * @param flowProcess of type FlowProcess
415   * @return String
416   */
417  public String getFullIdentifier( FlowProcess<? extends Config> flowProcess )
418    {
419    return getFullIdentifier( flowProcess.getConfig() );
420    }
421
422  /**
423   * Method getFullIdentifier returns a fully qualified resource identifier.
424   *
425   * @param conf of type Config
426   * @return String
427   */
428  public String getFullIdentifier( Config conf )
429    {
430    return getIdentifier();
431    }
432
433  /**
434   * Method createResource creates the underlying resource.
435   *
436   * @param flowProcess of type FlowProcess
437   * @return boolean
438   * @throws IOException when there is an error making directories
439   */
440  public boolean createResource( FlowProcess<? extends Config> flowProcess ) throws IOException
441    {
442    return createResource( flowProcess.getConfig() );
443    }
444
  /**
   * Method createResource creates the underlying resource represented by this instance.
   *
   * @param conf of type Config
   * @return boolean
   * @throws IOException when there is an error making directories
   */
  public abstract boolean createResource( Config conf ) throws IOException;
453
454  /**
455   * Method deleteResource deletes the resource represented by this instance.
456   *
457   * @param flowProcess of type FlowProcess
458   * @return boolean
459   * @throws IOException when the resource cannot be deleted
460   */
461  public boolean deleteResource( FlowProcess<? extends Config> flowProcess ) throws IOException
462    {
463    return deleteResource( flowProcess.getConfig() );
464    }
465
  /**
   * Method deleteResource deletes the underlying resource represented by this instance.
   *
   * @param conf of type Config
   * @return boolean
   * @throws IOException when the resource cannot be deleted
   */
  public abstract boolean deleteResource( Config conf ) throws IOException;
474
475  /**
476   * Method prepareResourceForRead allows the underlying resource to be notified when reading will begin.
477   * <p>
478   * This method will be called client side so that any remote or external resources can be initialized.
479   * <p>
480   * If this method returns {@code false}, an exception will be thrown halting the current Flow.
481   * <p>
482   * In most cases, resource initialization should happen in the {@link #openForRead(FlowProcess, Object)}  method.
483   * <p>
484   * This allows for initialization of cluster side resources, like a JDBC driver used to read data from a database,
485   * that cannot be passed client to cluster.
486   *
487   * @param conf of type Config
488   * @return returns true if successful
489   * @throws IOException
490   */
491  public boolean prepareResourceForRead( Config conf ) throws IOException
492    {
493    return true;
494    }
495
496  /**
497   * Method prepareResourceForWrite allows the underlying resource to be notified when writing will begin.
498   * <p>
499   * This method will be called once client side so that any remote or external resources can be initialized.
500   * <p>
501   * If this method returns {@code false}, an exception will be thrown halting the current Flow.
502   * <p>
503   * In most cases, resource initialization should happen in the {@link #openForWrite(FlowProcess, Object)} method.
504   * <p>
505   * This allows for initialization of cluster side resources, like a JDBC driver used to write data to a database,
506   * that cannot be passed client to cluster.
507   * <p>
508   * In the above JDBC example, overriding this method will allow for testing for the existence of and/or creating
509   * a remote table used by all individual cluster side tasks.
510   *
511   * @param conf of type Config
512   * @return returns true if successful
513   * @throws IOException
514   */
515  public boolean prepareResourceForWrite( Config conf ) throws IOException
516    {
517    return true;
518    }
519
520  /**
521   * Method commitResource allows the underlying resource to be notified when all write processing is
522   * successful so that any additional cleanup or processing may be completed.
523   * <p>
524   * See {@link #rollbackResource(Object)} to handle cleanup in the face of failures.
525   * <p>
526   * This method is invoked once client side and not in the cluster, if any.
527   * <p>
528   * If other sink Tap instance in a given Flow fail on commitResource after called on this instance,
529   * rollbackResource will not be called.
530   *
531   * @param conf of type Config
532   * @return returns true if successful
533   * @throws IOException
534   */
535  public boolean commitResource( Config conf ) throws IOException
536    {
537    return true;
538    }
539
540  /**
541   * Method rollbackResource allows the underlying resource to be notified when any write processing has failed or
542   * was stopped so that any cleanup may be started.
543   * <p>
544   * See {@link #commitResource(Object)} to handle cleanup when the write has successfully completed.
545   * <p>
546   * This method is invoked once client side and not in the cluster, if any.
547   *
548   * @param conf of type Config
549   * @return returns true if successful
550   * @throws IOException
551   */
552  public boolean rollbackResource( Config conf ) throws IOException
553    {
554    return true;
555    }
556
557  /**
558   * Method resourceExists returns true if the path represented by this instance exists.
559   *
560   * @param flowProcess of type FlowProcess
561   * @return true if the underlying resource already exists
562   * @throws IOException when the status cannot be determined
563   */
564  public boolean resourceExists( FlowProcess<? extends Config> flowProcess ) throws IOException
565    {
566    return resourceExists( flowProcess.getConfig() );
567    }
568
  /**
   * Method resourceExists returns true if the resource represented by this instance exists.
   *
   * @param conf of type Config
   * @return true if the underlying resource already exists
   * @throws IOException when the status cannot be determined
   */
  public abstract boolean resourceExists( Config conf ) throws IOException;
577
578  /**
579   * Method getModifiedTime returns the date this resource was last modified.
580   * <p>
581   * If the resource does not exist, returns zero (0).
582   * <p>
583   * If the resource is continuous, returns {@link Long#MAX_VALUE}.
584   *
585   * @param flowProcess of type FlowProcess
586   * @return The date this resource was last modified.
587   * @throws IOException
588   */
589  public long getModifiedTime( FlowProcess<? extends Config> flowProcess ) throws IOException
590    {
591    return getModifiedTime( flowProcess.getConfig() );
592    }
593
  /**
   * Method getModifiedTime returns the date this resource was last modified.
   * <p>
   * If the resource does not exist, returns zero (0).
   * <p>
   * If the resource is continuous, returns {@link Long#MAX_VALUE}.
   *
   * @param conf of type Config
   * @return The date this resource was last modified.
   * @throws IOException if thrown by the implementing sub-class
   */
  public abstract long getModifiedTime( Config conf ) throws IOException;
606
607  /**
608   * Method getSinkMode returns the {@link SinkMode} }of this Tap object.
609   *
610   * @return the sinkMode (type SinkMode) of this Tap object.
611   */
612  public SinkMode getSinkMode()
613    {
614    return sinkMode;
615    }
616
617  /**
618   * Method isKeep indicates whether the resource represented by this instance should be kept if it
619   * already exists when the Flow is started.
620   *
621   * @return boolean
622   */
623  public boolean isKeep()
624    {
625    return sinkMode == SinkMode.KEEP;
626    }
627
628  /**
629   * Method isReplace indicates whether the resource represented by this instance should be deleted if it
630   * already exists when the Flow is started.
631   *
632   * @return boolean
633   */
634  public boolean isReplace()
635    {
636    return sinkMode == SinkMode.REPLACE;
637    }
638
639  /**
640   * Method isUpdate indicates whether the resource represented by this instance should be updated if it already
641   * exists. Otherwise a new resource will be created, via {@link #createResource(Object)}, when the Flow is started.
642   *
643   * @return boolean
644   */
645  public boolean isUpdate()
646    {
647    return sinkMode == SinkMode.UPDATE;
648    }
649
650  /**
651   * Method isSink returns true if this Tap instance can be used as a sink.
652   *
653   * @return boolean
654   */
655  public boolean isSink()
656    {
657    return getScheme().isSink();
658    }
659
660  /**
661   * Method isSource returns true if this Tap instance can be used as a source.
662   *
663   * @return boolean
664   */
665  public boolean isSource()
666    {
667    return getScheme().isSource();
668    }
669
670  /**
671   * Method isTemporary returns true if this Tap is temporary (used for intermediate results).
672   *
673   * @return the temporary (type boolean) of this Tap object.
674   */
675  public boolean isTemporary()
676    {
677    return false;
678    }
679
680  /**
681   * Returns a {@link cascading.property.ConfigDef} instance that allows for local properties to be set and made available via
682   * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked.
683   * <p>
684   * Any properties set on the configDef will not show up in any {@link Flow} or {@link cascading.flow.FlowStep} process
685   * level configuration, but will override any of those values as seen by the current Tap instance method call where a
686   * FlowProcess is provided except for the {@link #sourceConfInit(cascading.flow.FlowProcess, Object)} and
687   * {@link #sinkConfInit(cascading.flow.FlowProcess, Object)} methods.
688   * <p>
689   * That is, the {@code *confInit} methods are called before any ConfigDef is applied, so any values placed into
690   * a ConfigDef instance will not be visible to them.
691   *
692   * @return an instance of ConfigDef
693   */
694  public ConfigDef getConfigDef()
695    {
696    if( configDef == null )
697      configDef = new ConfigDef();
698
699    return configDef;
700    }
701
702  /**
703   * Returns {@code true} if there are properties in the configDef instance.
704   *
705   * @return true if there are configDef properties
706   */
707  public boolean hasConfigDef()
708    {
709    return configDef != null && !configDef.isEmpty();
710    }
711
712  /**
713   * Returns a {@link ConfigDef} instance that allows for process level properties to be set and made available via
714   * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked.
715   * <p>
716   * Any properties set on the nodeConfigDef will not show up in any Flow configuration, but will show up in
717   * the current process {@link cascading.flow.FlowNode} (in Apache Tez the Vertex configuration). Any value set in the
718   * nodeConfigDef will be overridden by the pipe local {@code #getConfigDef} instance.
719   * <p>
720   * Use this method to tweak properties in the process node this tap instance is planned into.
721   *
722   * @return an instance of ConfigDef
723   */
724  @Override
725  public ConfigDef getNodeConfigDef()
726    {
727    if( nodeConfigDef == null )
728      nodeConfigDef = new ConfigDef();
729
730    return nodeConfigDef;
731    }
732
733  /**
734   * Returns {@code true} if there are properties in the nodeConfigDef instance.
735   *
736   * @return true if there are nodeConfigDef properties
737   */
738  @Override
739  public boolean hasNodeConfigDef()
740    {
741    return nodeConfigDef != null && !nodeConfigDef.isEmpty();
742    }
743
744  /**
745   * Returns a {@link ConfigDef} instance that allows for process level properties to be set and made available via
746   * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked.
747   * <p>
748   * Any properties set on the stepConfigDef will not show up in any Flow configuration, but will show up in
749   * the current process {@link cascading.flow.FlowStep} (in Hadoop the MapReduce jobconf). Any value set in the
750   * stepConfigDef will be overridden by the tap local {@code #getConfigDef} instance.
751   * <p>
752   * Use this method to tweak properties in the process step this tap instance is planned into.
753   * <p>
754   * Note the {@code *confInit} methods are called before any ConfigDef is applied, so any values placed into
755   * a ConfigDef instance will not be visible to them.
756   *
757   * @return an instance of ConfigDef
758   */
759  @Override
760  public ConfigDef getStepConfigDef()
761    {
762    if( stepConfigDef == null )
763      stepConfigDef = new ConfigDef();
764
765    return stepConfigDef;
766    }
767
768  /**
769   * Returns {@code true} if there are properties in the stepConfigDef instance.
770   *
771   * @return true if there are stepConfigDef properties
772   */
773  @Override
774  public boolean hasStepConfigDef()
775    {
776    return stepConfigDef != null && !stepConfigDef.isEmpty();
777    }
778
779  public Spliterator<TupleEntry> spliterator( FlowProcess<? extends Config> flowProcess )
780    {
781    return splititerator( openForReadUnchecked( flowProcess ) );
782    }
783
784  protected TupleEntryIterator openForReadUnchecked( FlowProcess<? extends Config> flowProcess )
785    {
786    try
787      {
788      return openForRead( flowProcess );
789      }
790    catch( IOException exception )
791      {
792      throw new UncheckedIOException( exception );
793      }
794    }
795
796  protected Spliterator<TupleEntry> splititerator( TupleEntryIterator iterator )
797    {
798    return Spliterators.spliteratorUnknownSize( iterator, 0 );
799    }
800
801  /**
802   * Method entryStream returns a {@link Stream} of {@link TupleEntry} instances from the given
803   * Tap instance.
804   * <p>
805   * Also see {@link cascading.tuple.TupleEntryStream#entryStream(Tap, FlowProcess)}.
806   * <p>
807   * Note, the returned Stream instance must be closed in order to clean up underlying resources. This
808   * is simply accomplished with a try-with-resources statement.
809   *
810   * @param flowProcess represents the current platform configuration
811   * @return a Stream of TupleEntry instances
812   */
813  public Stream<TupleEntry> entryStream( FlowProcess<? extends Config> flowProcess )
814    {
815    TupleEntryIterator iterator = openForReadUnchecked( flowProcess );
816    Spliterator<TupleEntry> spliterator = splititerator( iterator );
817
818    try
819      {
820      return StreamSupport
821        .stream( spliterator, false )
822        .onClose( asUncheckedRunnable( iterator ) );
823      }
824    catch( Error | RuntimeException error )
825      {
826      try
827        {
828        iterator.close();
829        }
830      catch( IOException exception )
831        {
832        try
833          {
834          error.addSuppressed( exception );
835          }
836        catch( Throwable ignore ){}
837        }
838
839      throw error;
840      }
841    }
842
843  /**
844   * Method entryStreamCopy returns a {@link Stream} of {@link TupleEntry} instances from the given
845   * Tap instance.
846   * <p>
847   * This method returns an TupleEntry instance suitable for caching.
848   * <p>
849   * Also see {@link cascading.tuple.TupleEntryStream#entryStreamCopy(Tap, FlowProcess)}.
850   * <p>
851   * Note, the returned Stream instance must be closed in order to clean up underlying resources. This
852   * is simply accomplished with a try-with-resources statement.
853   *
854   * @param flowProcess represents the current platform configuration
855   * @return a Stream of TupleEntry instances
856   */
857  public Stream<TupleEntry> entryStreamCopy( FlowProcess<? extends Config> flowProcess )
858    {
859    return entryStream( flowProcess ).map( TupleEntry::new );
860    }
861
862  /**
863   * Method entryStream returns a {@link Stream} of {@link TupleEntry} instances from the given
864   * Tap instance.
865   * <p>
866   * Also see {@link cascading.tuple.TupleEntryStream#entryStream(Tap, FlowProcess, Fields)}.
867   * <p>
868   * Note, the returned Stream instance must be closed in order to clean up underlying resources. This
869   * is simply accomplished with a try-with-resources statement.
870   *
871   * @param flowProcess represents the current platform configuration
872   * @param selector    the fields to select from the underlying TupleEntry
873   * @return a Stream of TupleEntry instances
874   */
875  public Stream<TupleEntry> entryStream( FlowProcess<? extends Config> flowProcess, Fields selector )
876    {
877    return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectEntry( selector ) );
878    }
879
880  /**
881   * Method entryStreamCopy returns a {@link Stream} of {@link TupleEntry} instances from the given
882   * Tap instance.
883   * <p>
884   * Also see {@link cascading.tuple.TupleEntryStream#entryStreamCopy(Tap, FlowProcess)}.
885   * <p>
886   * Note, the returned Stream instance must be closed in order to clean up underlying resources. This
887   * is simply accomplished with a try-with-resources statement.
888   *
889   * @param flowProcess represents the current platform configuration
890   * @param selector    the fields to select from the underlying TupleEntry
891   * @return a Stream of TupleEntry instances
892   */
893  public Stream<TupleEntry> entryStreamCopy( FlowProcess<? extends Config> flowProcess, Fields selector )
894    {
895    return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectEntryCopy( selector ) );
896    }
897
898  /**
899   * Method tupleStream returns a {@link Stream} of {@link Tuple} instances from the given
900   * Tap instance.
901   * <p>
902   * Also see {@link cascading.tuple.TupleStream#tupleStream(Tap, FlowProcess)}.
903   *
904   * @param flowProcess represents the current platform configuration
905   * @return a Stream of Tuple instances
906   */
907  public Stream<Tuple> tupleStream( FlowProcess<? extends Config> flowProcess )
908    {
909    return entryStream( flowProcess ).map( TupleEntry::getTuple );
910    }
911
912  /**
913   * Method tupleStreamCopy returns a {@link Stream} of {@link Tuple} instances from the given
914   * Tap instance.
915   * <p>
916   * This method returns an Tuple instance suitable for caching.
917   * <p>
918   * Also see {@link cascading.tuple.TupleStream#tupleStreamCopy(Tap, FlowProcess)}.
919   *
920   * @param flowProcess represents the current platform configuration
921   * @return a Stream of Tuple instances
922   */
923  public Stream<Tuple> tupleStreamCopy( FlowProcess<? extends Config> flowProcess )
924    {
925    return entryStream( flowProcess ).map( TupleEntry::getTupleCopy );
926    }
927
928  /**
929   * Method tupleStream returns a {@link Stream} of {@link Tuple} instances from the given
930   * Tap instance.
931   * <p>
932   * Also see {@link cascading.tuple.TupleStream#tupleStream(Tap, FlowProcess, Fields)}.
933   *
934   * @param flowProcess represents the current platform configuration
935   * @param selector    the fields to select from the underlying Tuple
936   * @return a Stream of TupleE instances
937   */
938  public Stream<Tuple> tupleStream( FlowProcess<? extends Config> flowProcess, Fields selector )
939    {
940    return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectTuple( selector ) );
941    }
942
943  /**
944   * Method tupleStreamCopy returns a {@link Stream} of {@link Tuple} instances from the given
945   * Tap instance.
946   * <p>
947   * This method returns an Tuple instance suitable for caching.
948   * <p>
949   * Also see {@link cascading.tuple.TupleStream#tupleStreamCopy(Tap, FlowProcess)}.
950   *
951   * @param flowProcess represents the current platform configuration
952   * @param selector    the fields to select from the underlying Tuple
953   * @return a Stream of TupleE instances
954   */
955  public Stream<Tuple> tupleStreamCopy( FlowProcess<? extends Config> flowProcess, Fields selector )
956    {
957    return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectTupleCopy( selector ) );
958    }
959
960  private static Runnable asUncheckedRunnable( Closeable closeable )
961    {
962    return () ->
963    {
964    try
965      {
966      closeable.close();
967      }
968    catch( IOException exception )
969      {
970      throw new UncheckedIOException( exception );
971      }
972    };
973    }
974
975  @Override
976  public boolean equals( Object object )
977    {
978    if( this == object )
979      return true;
980    if( object == null || getClass() != object.getClass() )
981      return false;
982
983    Tap tap = (Tap) object;
984
985    if( getIdentifier() != null ? !getIdentifier().equals( tap.getIdentifier() ) : tap.getIdentifier() != null )
986      return false;
987
988    if( getScheme() != null ? !getScheme().equals( tap.getScheme() ) : tap.getScheme() != null )
989      return false;
990
991    return true;
992    }
993
994  @Override
995  public int hashCode()
996    {
997    int result = getIdentifier() != null ? getIdentifier().hashCode() : 0;
998
999    result = 31 * result + ( getScheme() != null ? getScheme().hashCode() : 0 );
1000
1001    return result;
1002    }
1003
1004  @Override
1005  public String toString()
1006    {
1007    if( getIdentifier() != null )
1008      return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl( getIdentifier() ) + "\"]"; // sanitize
1009    else
1010      return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]";
1011    }
1012  }