001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.flow;
023
024import java.util.Collection;
025import java.util.Collections;
026import java.util.HashMap;
027import java.util.HashSet;
028import java.util.Map;
029import java.util.Properties;
030import java.util.Set;
031
032import cascading.CascadingException;
033import cascading.flow.planner.FlowPlanner;
034import cascading.flow.planner.PlatformInfo;
035import cascading.flow.planner.rule.RuleRegistrySet;
036import cascading.pipe.Pipe;
037import cascading.property.AppProps;
038import cascading.property.PropertyUtil;
039import cascading.scheme.Scheme;
040import cascading.tap.Tap;
041import cascading.util.Util;
042
043import static cascading.flow.FlowDef.flowDef;
044
045/**
046 * Class FlowConnector is the base class for all platform planners.
047 * <p>
048 * See the {@link FlowDef} class for a fluent way to define a new Flow.
049 * <p>
050 * Use the FlowConnector to link source and sink {@link Tap} instances with an assembly of {@link Pipe} instances into
051 * an executable {@link cascading.flow.Flow}.
052 * <p>
053 * FlowConnector invokes a planner for the target execution environment.
054 * <p>
055 * For executing Flows in local memory against local files, see {@link cascading.flow.local.LocalFlowConnector}.
056 * <p>
 * For Apache Hadoop, see the {@link cascading.flow.hadoop2.Hadoop2MR1FlowConnector}.
058 * Or if you have a pre-existing custom Hadoop job to execute, see {@link cascading.flow.hadoop.MapReduceFlow}, which
059 * doesn't require a planner.
060 * <p>
061 * Note that all {@code connect} methods take a single {@code tail} or an array of {@code tail} Pipe instances. "tail"
 * refers to the last connected Pipe instances in a pipe-assembly. Pipe-assemblies are graphs of objects with "heads"
063 * and "tails". From a given "tail", all connected heads can be found, but not the reverse. So "tails" must be
064 * supplied by the user.
065 * <p>
066 * The FlowConnector and the underlying execution framework (Hadoop or local mode) can be configured via a
067 * {@link Map} or {@link Properties} instance given to the constructor.
068 * <p>
069 * This properties map must be populated before constructing a FlowConnector instance. Many planner specific
070 * properties can be set through the {@link FlowConnectorProps} fluent interface.
071 * <p>
072 * Some planners have required properties. Hadoop expects {@link AppProps#setApplicationJarPath(java.util.Map, String)} or
073 * {@link AppProps#setApplicationJarClass(java.util.Map, Class)} to be set.
074 * <p>
075 * Any properties set and passed through the FlowConnector constructor will be global to all Flow instances created through
 * that FlowConnector instance. Some properties are on the {@link FlowDef} and would only be applicable to the
077 * resulting Flow instance.
078 * <p>
079 * These properties are used to influence the current planner and are also passed down to the
080 * execution framework to override any default values. For example when using the Hadoop planner, the number of reducers
081 * or mappers can be set by using platform specific properties.
082 * <p>
083 * Custom operations (Functions, Filter, etc) may also retrieve these property values at runtime through calls to
084 * {@link cascading.flow.FlowProcess#getProperty(String)} or {@link FlowProcess#getStringProperty(String)}.
085 * <p>
086 * Most applications will need to call {@link cascading.property.AppProps#setApplicationJarClass(java.util.Map, Class)} or
087 * {@link cascading.property.AppProps#setApplicationJarPath(java.util.Map, String)} so that
088 * the correct application jar file is passed through to all child processes. The Class or path must reference
089 * the custom application jar, not a Cascading library class or jar. The easiest thing to do is give setApplicationJarClass
090 * the Class with your static main function and let Cascading figure out which jar to use.
091 * <p>
 * Note that {@code Map<Object,Object>} is compatible with the {@link Properties} class, so properties can be loaded at
093 * runtime from a configuration file.
094 * <p>
095 * By default, all {@link cascading.operation.Assertion}s are planned into the resulting Flow instance. This can be
096 * changed for a given Flow by calling {@link FlowDef#setAssertionLevel(cascading.operation.AssertionLevel)} or globally
097 * via {@link FlowConnectorProps#setAssertionLevel(cascading.operation.AssertionLevel)}.
098 * <p>
099 * Also by default, all {@link cascading.operation.Debug}s are planned into the resulting Flow instance. This can be
100 * changed for a given flow by calling {@link FlowDef#setDebugLevel(cascading.operation.DebugLevel)} or globally via
101 * {@link FlowConnectorProps#setDebugLevel(cascading.operation.DebugLevel)}.
102 * <p>
103 * As of version 3.0, custom {@link cascading.flow.planner.rule.RuleRegistry} instances can be provided to customize
104 * a given planner.
105 *
106 * @see cascading.flow.local.LocalFlowConnector
107 * @see cascading.flow.hadoop2.Hadoop2MR1FlowConnector
108 * @see cascading.flow.tez.Hadoop2TezFlowConnector
109 */
110public abstract class FlowConnector
111  {
112  /** Field properties */
113  protected Map<Object, Object> properties; // may be a Map or Properties instance. see PropertyUtil
114
115  private RuleRegistrySet ruleRegistrySet;
116
117  /**
118   * Method getIntermediateSchemeClass is used for debugging.
119   *
120   * @param properties of type Map
121   * @return Class
122   */
123  public Class getIntermediateSchemeClass( Map<Object, Object> properties )
124    {
125    // supporting stuffed classes to overcome classloading issue
126    Object type = PropertyUtil.getProperty( properties, FlowConnectorProps.INTERMEDIATE_SCHEME_CLASS, null );
127
128    if( type == null )
129      return getDefaultIntermediateSchemeClass();
130
131    if( type instanceof Class )
132      return (Class) type;
133
134    try
135      {
136      return FlowConnector.class.getClassLoader().loadClass( type.toString() );
137      }
138    catch( ClassNotFoundException exception )
139      {
140      throw new CascadingException( "unable to load class: " + type.toString(), exception );
141      }
142    }
143
144  protected abstract Class<? extends Scheme> getDefaultIntermediateSchemeClass();
145
146  protected FlowConnector()
147    {
148    this.properties = new HashMap<>();
149    }
150
151  protected FlowConnector( RuleRegistrySet ruleRegistrySet )
152    {
153    this();
154    this.ruleRegistrySet = ruleRegistrySet;
155    }
156
157  protected FlowConnector( Map<Object, Object> properties )
158    {
159    if( properties == null )
160      this.properties = new HashMap<>();
161    else if( properties instanceof Properties )
162      this.properties = new Properties( (Properties) properties );
163    else
164      this.properties = new HashMap<>( properties );
165    }
166
167  protected FlowConnector( Map<Object, Object> properties, RuleRegistrySet ruleRegistrySet )
168    {
169    this( properties );
170    this.ruleRegistrySet = ruleRegistrySet;
171    }
172
173  /**
174   * Method getProperties returns the properties of this FlowConnector object. The returned Map instance
175   * is immutable to prevent changes to the underlying property values in this FlowConnector instance.
176   * <p>
177   * If a {@link Properties} instance was passed to the constructor, the returned object will be a flattened
178   * {@link Map} instance.
179   *
180   * @return the properties (type Map) of this FlowConnector object.
181   */
182  public Map<Object, Object> getProperties()
183    {
184    // Sub-classes of FlowConnector should rely on PropertyUtil to manage access to properties objects internally.
185    return Collections.unmodifiableMap( PropertyUtil.asFlatMap( properties ) );
186    }
187
188  /**
189   * Method connect links the given source and sink Taps to the given pipe assembly.
190   *
191   * @param source source Tap to bind to the head of the given tail Pipe
192   * @param sink   sink Tap to bind to the given tail Pipe
193   * @param tail   tail end of a pipe assembly
194   * @return Flow
195   */
196  public Flow connect( Tap source, Tap sink, Pipe tail )
197    {
198    return connect( null, source, sink, tail );
199    }
200
201  /**
202   * Method connect links the given source and sink Taps to the given pipe assembly.
203   *
204   * @param name   name to give the resulting Flow
205   * @param source source Tap to bind to the head of the given tail Pipe
206   * @param sink   sink Tap to bind to the given tail Pipe
207   * @param tail   tail end of a pipe assembly
208   * @return Flow
209   */
210  public Flow connect( String name, Tap source, Tap sink, Pipe tail )
211    {
212    Map<String, Tap> sources = new HashMap<String, Tap>();
213
214    sources.put( tail.getHeads()[ 0 ].getName(), source );
215
216    return connect( name, sources, sink, tail );
217    }
218
219  /**
220   * Method connect links the given source, sink, and trap Taps to the given pipe assembly. The given trap will
221   * be linked to the assembly head along with the source.
222   *
223   * @param name   name to give the resulting Flow
224   * @param source source Tap to bind to the head of the given tail Pipe
225   * @param sink   sink Tap to bind to the given tail Pipe
226   * @param trap   trap Tap to sink all failed Tuples into
227   * @param tail   tail end of a pipe assembly
228   * @return Flow
229   */
230  public Flow connect( String name, Tap source, Tap sink, Tap trap, Pipe tail )
231    {
232    Map<String, Tap> sources = new HashMap<String, Tap>();
233
234    sources.put( tail.getHeads()[ 0 ].getName(), source );
235
236    Map<String, Tap> traps = new HashMap<String, Tap>();
237
238    traps.put( tail.getHeads()[ 0 ].getName(), trap );
239
240    return connect( name, sources, sink, traps, tail );
241    }
242
243  /**
244   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
245   *
246   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
247   * @param sink    sink Tap to bind to the given tail Pipe
248   * @param tail    tail end of a pipe assembly
249   * @return Flow
250   */
251  public Flow connect( Map<String, Tap> sources, Tap sink, Pipe tail )
252    {
253    return connect( null, sources, sink, tail );
254    }
255
256  /**
257   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
258   *
259   * @param name    name to give the resulting Flow
260   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
261   * @param sink    sink Tap to bind to the given tail Pipe
262   * @param tail    tail end of a pipe assembly
263   * @return Flow
264   */
265  public Flow connect( String name, Map<String, Tap> sources, Tap sink, Pipe tail )
266    {
267    Map<String, Tap> sinks = new HashMap<String, Tap>();
268
269    sinks.put( tail.getName(), sink );
270
271    return connect( name, sources, sinks, tail );
272    }
273
274  /**
275   * Method connect links the named source and trap Taps and sink Tap to the given pipe assembly.
276   *
277   * @param name    name to give the resulting Flow
278   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
279   * @param sink    sink Tap to bind to the given tail Pipe
280   * @param traps   all pipe names and trap Taps to sink all failed Tuples into
281   * @param tail    tail end of a pipe assembly
282   * @return Flow
283   */
284  public Flow connect( String name, Map<String, Tap> sources, Tap sink, Map<String, Tap> traps, Pipe tail )
285    {
286    Map<String, Tap> sinks = new HashMap<String, Tap>();
287
288    sinks.put( tail.getName(), sink );
289
290    return connect( name, sources, sinks, traps, tail );
291    }
292
293  /**
294   * Method connect links the named trap Taps, source and sink Tap to the given pipe assembly.
295   *
296   * @param name   name to give the resulting Flow
297   * @param source source Tap to bind to the head of the given tail Pipe
298   * @param sink   sink Tap to bind to the given tail Pipe
299   * @param traps  all pipe names and trap Taps to sink all failed Tuples into
300   * @param tail   tail end of a pipe assembly
301   * @return Flow
302   */
303  public Flow connect( String name, Tap source, Tap sink, Map<String, Tap> traps, Pipe tail )
304    {
305    Map<String, Tap> sources = new HashMap<String, Tap>();
306
307    sources.put( tail.getHeads()[ 0 ].getName(), source );
308
309    Map<String, Tap> sinks = new HashMap<String, Tap>();
310
311    sinks.put( tail.getName(), sink );
312
313    return connect( name, sources, sinks, traps, tail );
314    }
315
316  /**
317   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
318   * <p>
319   * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe.
320   * So the head pipe does not need to be included as an argument.
321   *
322   * @param source source Tap to bind to the head of the given tail Pipes
323   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
324   * @param tails  all tail ends of a pipe assembly
325   * @return Flow
326   */
327  public Flow connect( Tap source, Map<String, Tap> sinks, Collection<Pipe> tails )
328    {
329    return connect( null, source, sinks, tails.toArray( new Pipe[ tails.size() ] ) );
330    }
331
332  /**
333   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
334   * <p>
335   * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe.
336   * So the head pipe does not need to be included as an argument.
337   *
338   * @param name   name to give the resulting Flow
339   * @param source source Tap to bind to the head of the given tail Pipes
340   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
341   * @param tails  all tail ends of a pipe assembly
342   * @return Flow
343   */
344  public Flow connect( String name, Tap source, Map<String, Tap> sinks, Collection<Pipe> tails )
345    {
346    return connect( name, source, sinks, tails.toArray( new Pipe[ tails.size() ] ) );
347    }
348
349  /**
350   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
351   * <p>
352   * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe.
353   * So the head pipe does not need to be included as an argument.
354   *
355   * @param source source Tap to bind to the head of the given tail Pipes
356   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
357   * @param tails  all tail ends of a pipe assembly
358   * @return Flow
359   */
360  public Flow connect( Tap source, Map<String, Tap> sinks, Pipe... tails )
361    {
362    return connect( null, source, sinks, tails );
363    }
364
365  /**
366   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
367   * <p>
368   * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe.
369   * So the head pipe does not need to be included as an argument.
370   *
371   * @param name   name to give the resulting Flow
372   * @param source source Tap to bind to the head of the given tail Pipes
373   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
374   * @param tails  all tail ends of a pipe assembly
375   * @return Flow
376   */
377  public Flow connect( String name, Tap source, Map<String, Tap> sinks, Pipe... tails )
378    {
379    Set<Pipe> heads = new HashSet<Pipe>();
380
381    for( Pipe pipe : tails )
382      Collections.addAll( heads, pipe.getHeads() );
383
384    if( heads.isEmpty() )
385      throw new IllegalArgumentException( "no pipe instance found" );
386
387    if( heads.size() != 1 )
388      throw new IllegalArgumentException( "there may be only 1 head pipe instance, found " + heads.size() );
389
390    Map<String, Tap> sources = new HashMap<String, Tap>();
391
392    for( Pipe pipe : heads )
393      sources.put( pipe.getName(), source );
394
395    return connect( name, sources, sinks, tails );
396    }
397
398  /**
399   * Method connect links the named sources and sinks to the given pipe assembly.
400   *
401   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
402   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
403   * @param tails   all tail ends of a pipe assembly
404   * @return Flow
405   */
406  public Flow connect( Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails )
407    {
408    return connect( null, sources, sinks, tails );
409    }
410
411  /**
412   * Method connect links the named sources and sinks to the given pipe assembly.
413   *
414   * @param name    name to give the resulting Flow
415   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
416   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
417   * @param tails   all tail ends of a pipe assembly
418   * @return Flow
419   */
420  public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails )
421    {
422    return connect( name, sources, sinks, new HashMap<String, Tap>(), tails );
423    }
424
425  /**
426   * Method connect links the named sources, sinks and traps to the given pipe assembly.
427   *
428   * @param name    name to give the resulting Flow
429   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
430   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
431   * @param traps   all pipe names and trap Taps to sink all failed Tuples into
432   * @param tails   all tail ends of a pipe assembly
433   * @return Flow
434   */
435  public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps, Pipe... tails )
436    {
437    name = name == null ? makeName( tails ) : name;
438
439    FlowDef flowDef = flowDef()
440      .setName( name )
441      .addSources( sources )
442      .addSinks( sinks )
443      .addTraps( traps )
444      .addTails( tails );
445
446    return connect( flowDef );
447    }
448
449  public Flow connect( FlowDef flowDef )
450    {
451    FlowPlanner flowPlanner = createFlowPlanner();
452
453    flowPlanner.initialize( this, properties );
454
455    RuleRegistrySet ruleRegistrySet = getRuleRegistrySet();
456
457    return flowPlanner.buildFlow( flowDef, ruleRegistrySet );
458    }
459
460  protected abstract FlowPlanner createFlowPlanner();
461
462  /**
463   * Returns the configured RuleRegistry, or the default for this platform.
464   * <p>
465   * The registry is mutable, and will be applied to all subsequent planner operations via {@link #connect(FlowDef)}.
466   *
467   * @return the current RuleRegistry instance
468   */
469  public RuleRegistrySet getRuleRegistrySet()
470    {
471    if( ruleRegistrySet != null )
472      return ruleRegistrySet;
473
474    ruleRegistrySet = createDefaultRuleRegistrySet();
475
476    return ruleRegistrySet;
477    }
478
479  protected abstract RuleRegistrySet createDefaultRuleRegistrySet();
480
481  /**
482   * Method getPlatformInfo returns an instance of {@link PlatformInfo} for the underlying platform.
483   *
484   * @return of type PlatformInfo
485   */
486  public PlatformInfo getPlatformInfo()
487    {
488    return createFlowPlanner().getPlatformInfo();
489    }
490
491  /////////
492  // UTIL
493  /////////
494
495  private String makeName( Pipe[] pipes )
496    {
497    String[] names = new String[ pipes.length ];
498
499    for( int i = 0; i < pipes.length; i++ )
500      names[ i ] = pipes[ i ].getName();
501
502    String name = Util.join( names, "+" );
503
504    if( name.length() > 32 )
505      name = name.substring( 0, 32 );
506
507    return name;
508    }
509  }