/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.CascadingException;
import cascading.flow.planner.FlowPlanner;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.rule.RuleRegistrySet;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.property.PropertyUtil;
import cascading.scheme.Scheme;
import cascading.tap.Tap;
import cascading.util.Util;

import static cascading.flow.FlowDef.flowDef;

/**
 * Class FlowConnector is the base class for all platform planners.
 * <p>
 * See the {@link FlowDef} class for a fluent way to define a new Flow.
 * <p>
 * Use the FlowConnector to link source and sink {@link Tap} instances with an assembly of {@link Pipe} instances into
 * an executable {@link cascading.flow.Flow}.
 * <p>
 * FlowConnector invokes a planner for the target execution environment.
 * <p>
 * For executing Flows in local memory against local files, see {@link cascading.flow.local.LocalFlowConnector}.
 * <p>
 * For Apache Hadoop, see the {@link cascading.flow.hadoop2.Hadoop2MR1FlowConnector}.
 * Or if you have a pre-existing custom Hadoop job to execute, see {@link cascading.flow.hadoop.MapReduceFlow}, which
 * doesn't require a planner.
 * <p>
 * Note that all {@code connect} methods take a single {@code tail} or an array of {@code tail} Pipe instances. "tail"
 * refers to the last connected Pipe instances in a pipe-assembly. Pipe-assemblies are graphs of objects with "heads"
 * and "tails". From a given "tail", all connected heads can be found, but not the reverse. So "tails" must be
 * supplied by the user.
 * <p>
 * The FlowConnector and the underlying execution framework (Hadoop or local mode) can be configured via a
 * {@link Map} or {@link Properties} instance given to the constructor.
 * <p>
 * This properties map must be populated before constructing a FlowConnector instance. Many planner specific
 * properties can be set through the {@link FlowConnectorProps} fluent interface.
 * <p>
 * Some planners have required properties. Hadoop expects {@link AppProps#setApplicationJarPath(java.util.Map, String)} or
 * {@link AppProps#setApplicationJarClass(java.util.Map, Class)} to be set.
 * <p>
 * Any properties set and passed through the FlowConnector constructor will be global to all Flow instances created
 * through that FlowConnector instance. Some properties are on the {@link FlowDef} and would only be applicable to the
 * resulting Flow instance.
 * <p>
 * These properties are used to influence the current planner and are also passed down to the
 * execution framework to override any default values. For example when using the Hadoop planner, the number of reducers
 * or mappers can be set by using platform specific properties.
 * <p>
 * Custom operations (Functions, Filter, etc) may also retrieve these property values at runtime through calls to
 * {@link cascading.flow.FlowProcess#getProperty(String)} or {@link FlowProcess#getStringProperty(String)}.
 * <p>
 * Most applications will need to call {@link cascading.property.AppProps#setApplicationJarClass(java.util.Map, Class)} or
 * {@link cascading.property.AppProps#setApplicationJarPath(java.util.Map, String)} so that
 * the correct application jar file is passed through to all child processes. The Class or path must reference
 * the custom application jar, not a Cascading library class or jar. The easiest thing to do is give setApplicationJarClass
 * the Class with your static main function and let Cascading figure out which jar to use.
 * <p>
 * Note that {@code Map<Object,Object>} is compatible with the {@link Properties} class, so properties can be loaded at
 * runtime from a configuration file.
 * <p>
 * By default, all {@link cascading.operation.Assertion}s are planned into the resulting Flow instance. This can be
 * changed for a given Flow by calling {@link FlowDef#setAssertionLevel(cascading.operation.AssertionLevel)} or globally
 * via {@link FlowConnectorProps#setAssertionLevel(cascading.operation.AssertionLevel)}.
 * <p>
 * Also by default, all {@link cascading.operation.Debug}s are planned into the resulting Flow instance. This can be
 * changed for a given flow by calling {@link FlowDef#setDebugLevel(cascading.operation.DebugLevel)} or globally via
 * {@link FlowConnectorProps#setDebugLevel(cascading.operation.DebugLevel)}.
 * <p>
 * As of version 3.0, custom {@link cascading.flow.planner.rule.RuleRegistry} instances can be provided to customize
 * a given planner.
 *
 * @see cascading.flow.local.LocalFlowConnector
 * @see cascading.flow.hadoop2.Hadoop2MR1FlowConnector
 * @see cascading.flow.tez.Hadoop2TezFlowConnector
 */
public abstract class FlowConnector
  {
  /** Longest Flow name that will be generated from the names of the tail Pipes. */
  private static final int MAX_GENERATED_NAME_LENGTH = 32;

  /** Field properties */
  protected Map<Object, Object> properties; // may be a Map or Properties instance. see PropertyUtil

  /** Rule registries handed to the planner; created on first access when not supplied to a constructor. */
  private RuleRegistrySet ruleRegistrySet;

  /**
   * Method getIntermediateSchemeClass is used for debugging.
   * <p>
   * The value stored under {@link FlowConnectorProps#INTERMEDIATE_SCHEME_CLASS} is consulted. When no value is
   * present the platform default is returned, a {@link Class} value is returned as is, and any other value is
   * converted to a String and loaded by name through the class loader of FlowConnector.
   *
   * @param properties of type Map
   * @return Class
   * @throws CascadingException if the configured class name cannot be loaded
   */
  public Class getIntermediateSchemeClass( Map<Object, Object> properties )
    {
    // supporting stuffed classes to overcome classloading issue
    Object configured = PropertyUtil.getProperty( properties, FlowConnectorProps.INTERMEDIATE_SCHEME_CLASS, null );

    if( configured instanceof Class )
      return (Class) configured;

    if( configured == null )
      return getDefaultIntermediateSchemeClass();

    ClassLoader loader = FlowConnector.class.getClassLoader();

    try
      {
      return loader.loadClass( configured.toString() );
      }
    catch( ClassNotFoundException exception )
      {
      throw new CascadingException( "unable to load class: " + configured.toString(), exception );
      }
    }

  /**
   * Method getDefaultIntermediateSchemeClass returns the Scheme type to use when no intermediate scheme class
   * has been configured.
   *
   * @return the default intermediate Scheme class
   */
  protected abstract Class<? extends Scheme> getDefaultIntermediateSchemeClass();

  /** Constructor FlowConnector creates a new FlowConnector instance with an empty set of properties. */
  protected FlowConnector()
    {
    this.properties = new HashMap<>();
    }

  /**
   * Constructor FlowConnector creates a new FlowConnector instance with an empty set of properties and
   * the given rule registries.
   *
   * @param ruleRegistrySet of type RuleRegistrySet
   */
  protected FlowConnector( RuleRegistrySet ruleRegistrySet )
    {
    this();
    this.ruleRegistrySet = ruleRegistrySet;
    }

  /**
   * Constructor FlowConnector creates a new FlowConnector instance from the given properties.
   * <p>
   * A {@link Properties} argument becomes the defaults of a new, empty Properties instance, any other Map is
   * copied into a new HashMap, and {@code null} results in an empty HashMap.
   *
   * @param properties of type Map, may be null
   */
  protected FlowConnector( Map<Object, Object> properties )
    {
    if( properties instanceof Properties )
      this.properties = new Properties( (Properties) properties );
    else if( properties != null )
      this.properties = new HashMap<>( properties );
    else
      this.properties = new HashMap<>();
    }

  /**
   * Constructor FlowConnector creates a new FlowConnector instance from the given properties and rule registries.
   *
   * @param properties      of type Map, may be null
   * @param ruleRegistrySet of type RuleRegistrySet
   */
  protected FlowConnector( Map<Object, Object> properties, RuleRegistrySet ruleRegistrySet )
    {
    this( properties );
    this.ruleRegistrySet = ruleRegistrySet;
    }

  /**
   * Method getProperties returns the properties of this FlowConnector object. The returned Map instance
   * is immutable to prevent changes to the underlying property values in this FlowConnector instance.
   * <p>
   * If a {@link Properties} instance was passed to the constructor, the returned object will be a flattened
   * {@link Map} instance.
   *
   * @return the properties (type Map) of this FlowConnector object.
   */
  public Map<Object, Object> getProperties()
    {
    // Sub-classes of FlowConnector should rely on PropertyUtil to manage access to properties objects internally.
    Map<Object, Object> flattened = PropertyUtil.asFlatMap( properties );

    return Collections.unmodifiableMap( flattened );
    }

  /**
   * Method connect links the given source and sink Taps to the given pipe assembly.
   *
   * @param source source Tap to bind to the head of the given tail Pipe
   * @param sink   sink Tap to bind to the given tail Pipe
   * @param tail   tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( Tap source, Tap sink, Pipe tail )
    {
    return connect( null, source, sink, tail );
    }

  /**
   * Method connect links the given source and sink Taps to the given pipe assembly.
   * <p>
   * The source is bound to the name of the first head reachable from the given tail.
   *
   * @param name   name to give the resulting Flow
   * @param source source Tap to bind to the head of the given tail Pipe
   * @param sink   sink Tap to bind to the given tail Pipe
   * @param tail   tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Tap source, Tap sink, Pipe tail )
    {
    Map<String, Tap> sources = singleTap( firstHeadName( tail ), source );

    return connect( name, sources, sink, tail );
    }

  /**
   * Method connect links the given source, sink, and trap Taps to the given pipe assembly. The given trap will
   * be linked to the assembly head along with the source.
   *
   * @param name   name to give the resulting Flow
   * @param source source Tap to bind to the head of the given tail Pipe
   * @param sink   sink Tap to bind to the given tail Pipe
   * @param trap   trap Tap to sink all failed Tuples into
   * @param tail   tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Tap source, Tap sink, Tap trap, Pipe tail )
    {
    Map<String, Tap> sources = singleTap( firstHeadName( tail ), source );
    Map<String, Tap> traps = singleTap( firstHeadName( tail ), trap );

    return connect( name, sources, sink, traps, tail );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   *
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
   * @param sink    sink Tap to bind to the given tail Pipe
   * @param tail    tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( Map<String, Tap> sources, Tap sink, Pipe tail )
    {
    return connect( null, sources, sink, tail );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   * <p>
   * The sink is bound to the name of the given tail.
   *
   * @param name    name to give the resulting Flow
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
   * @param sink    sink Tap to bind to the given tail Pipe
   * @param tail    tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Map<String, Tap> sources, Tap sink, Pipe tail )
    {
    Map<String, Tap> sinks = singleTap( tail.getName(), sink );

    return connect( name, sources, sinks, tail );
    }

  /**
   * Method connect links the named source and trap Taps and sink Tap to the given pipe assembly.
   *
   * @param name    name to give the resulting Flow
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipe
   * @param sink    sink Tap to bind to the given tail Pipe
   * @param traps   all pipe names and trap Taps to sink all failed Tuples into
   * @param tail    tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Map<String, Tap> sources, Tap sink, Map<String, Tap> traps, Pipe tail )
    {
    Map<String, Tap> sinks = singleTap( tail.getName(), sink );

    return connect( name, sources, sinks, traps, tail );
    }

  /**
   * Method connect links the named trap Taps, source and sink Tap to the given pipe assembly.
   *
   * @param name   name to give the resulting Flow
   * @param source source Tap to bind to the head of the given tail Pipe
   * @param sink   sink Tap to bind to the given tail Pipe
   * @param traps  all pipe names and trap Taps to sink all failed Tuples into
   * @param tail   tail end of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Tap source, Tap sink, Map<String, Tap> traps, Pipe tail )
    {
    Map<String, Tap> sources = singleTap( firstHeadName( tail ), source );
    Map<String, Tap> sinks = singleTap( tail.getName(), sink );

    return connect( name, sources, sinks, traps, tail );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   * <p>
   * Since only one source Tap is given, it is assumed to be associated with the 'head' pipe.
   * So the head pipe does not need to be included as an argument.
   *
   * @param source source Tap to bind to the head of the given tail Pipes
   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
   * @param tails  all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( Tap source, Map<String, Tap> sinks, Collection<Pipe> tails )
    {
    Pipe[] tailArray = tails.toArray( new Pipe[ tails.size() ] );

    return connect( null, source, sinks, tailArray );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   * <p>
   * Since only one source Tap is given, it is assumed to be associated with the 'head' pipe.
   * So the head pipe does not need to be included as an argument.
   *
   * @param name   name to give the resulting Flow
   * @param source source Tap to bind to the head of the given tail Pipes
   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
   * @param tails  all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Tap source, Map<String, Tap> sinks, Collection<Pipe> tails )
    {
    Pipe[] tailArray = tails.toArray( new Pipe[ tails.size() ] );

    return connect( name, source, sinks, tailArray );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   * <p>
   * Since only one source Tap is given, it is assumed to be associated with the 'head' pipe.
   * So the head pipe does not need to be included as an argument.
   *
   * @param source source Tap to bind to the head of the given tail Pipes
   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
   * @param tails  all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( Tap source, Map<String, Tap> sinks, Pipe... tails )
    {
    return connect( null, source, sinks, tails );
    }

  /**
   * Method connect links the named source Taps and sink Tap to the given pipe assembly.
   * <p>
   * Since only one source Tap is given, it is assumed to be associated with the 'head' pipe.
   * So the head pipe does not need to be included as an argument.
   *
   * @param name   name to give the resulting Flow
   * @param source source Tap to bind to the head of the given tail Pipes
   * @param sinks  all tail names and sink Taps to bind to the given tail Pipes
   * @param tails  all tail ends of a pipe assembly
   * @return Flow
   * @throws IllegalArgumentException if the given tails do not resolve to exactly one distinct head Pipe
   */
  public Flow connect( String name, Tap source, Map<String, Tap> sinks, Pipe... tails )
    {
    // gather the distinct heads reachable from every tail
    Set<Pipe> heads = new HashSet<>();

    for( Pipe tail : tails )
      Collections.addAll( heads, tail.getHeads() );

    int headCount = heads.size();

    if( headCount == 0 )
      throw new IllegalArgumentException( "no pipe instance found" );

    if( headCount != 1 )
      throw new IllegalArgumentException( "there may be only 1 head pipe instance, found " + headCount );

    // exactly one head remains, the single source is bound to it
    Pipe head = heads.iterator().next();
    Map<String, Tap> sources = singleTap( head.getName(), source );

    return connect( name, sources, sinks, tails );
    }

  /**
   * Method connect links the named sources and sinks to the given pipe assembly.
   *
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
   * @param tails   all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails )
    {
    return connect( null, sources, sinks, tails );
    }

  /**
   * Method connect links the named sources and sinks to the given pipe assembly.
   * <p>
   * No traps are bound, an empty trap Map is passed along.
   *
   * @param name    name to give the resulting Flow
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
   * @param tails   all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails )
    {
    Map<String, Tap> noTraps = new HashMap<>();

    return connect( name, sources, sinks, noTraps, tails );
    }

  /**
   * Method connect links the named sources, sinks and traps to the given pipe assembly.
   * <p>
   * If the given name is {@code null}, a name is generated from the names of the given tails.
   *
   * @param name    name to give the resulting Flow
   * @param sources all head names and source Taps to bind to the heads of the given tail Pipes
   * @param sinks   all tail names and sink Taps to bind to the given tail Pipes
   * @param traps   all pipe names and trap Taps to sink all failed Tuples into
   * @param tails   all tail ends of a pipe assembly
   * @return Flow
   */
  public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps, Pipe... tails )
    {
    String flowName = name == null ? makeName( tails ) : name;

    FlowDef definition = flowDef()
      .setName( flowName )
      .addSources( sources )
      .addSinks( sinks )
      .addTraps( traps )
      .addTails( tails );

    return connect( definition );
    }

  /**
   * Method connect plans the given {@link FlowDef} into a Flow.
   * <p>
   * A new planner is obtained from {@link #createFlowPlanner()}, initialized with this FlowConnector and its
   * properties, and then asked to build the Flow using the result of {@link #getRuleRegistrySet()}.
   *
   * @param flowDef the definition of the Flow to plan
   * @return Flow
   */
  public Flow connect( FlowDef flowDef )
    {
    FlowPlanner planner = createFlowPlanner();

    planner.initialize( this, properties );

    return planner.buildFlow( flowDef, getRuleRegistrySet() );
    }

  /**
   * Method createFlowPlanner returns the planner for the underlying platform.
   *
   * @return of type FlowPlanner
   */
  protected abstract FlowPlanner createFlowPlanner();

  /**
   * Returns the configured RuleRegistrySet, or the default for this platform.
   * <p>
   * The registry is mutable, and will be applied to all subsequent planner operations via {@link #connect(FlowDef)}.
   * <p>
   * When no instance was given to the constructor, the default is created on the first call and retained.
   *
   * @return the current RuleRegistrySet instance
   */
  public RuleRegistrySet getRuleRegistrySet()
    {
    if( ruleRegistrySet == null )
      ruleRegistrySet = createDefaultRuleRegistrySet();

    return ruleRegistrySet;
    }

  /**
   * Method createDefaultRuleRegistrySet returns the rule registries to use when none were given to the constructor.
   *
   * @return of type RuleRegistrySet
   */
  protected abstract RuleRegistrySet createDefaultRuleRegistrySet();

  /**
   * Method getPlatformInfo returns an instance of {@link PlatformInfo} for the underlying platform.
   * <p>
   * The value is read from a planner newly obtained through {@link #createFlowPlanner()}.
   *
   * @return of type PlatformInfo
   */
  public PlatformInfo getPlatformInfo()
    {
    FlowPlanner planner = createFlowPlanner();

    return planner.getPlatformInfo();
    }

  /////////
  // UTIL
  /////////

  /** Returns the name of the first head Pipe reported by the given tail. */
  private static String firstHeadName( Pipe tail )
    {
    return tail.getHeads()[ 0 ].getName();
    }

  /** Returns a new mutable Map holding only the given name and Tap. */
  private static Map<String, Tap> singleTap( String name, Tap tap )
    {
    Map<String, Tap> taps = new HashMap<>();

    taps.put( name, tap );

    return taps;
    }

  /** Joins the names of the given pipes with "+" and keeps at most the first 32 characters of the result. */
  private String makeName( Pipe[] pipes )
    {
    String[] pipeNames = new String[ pipes.length ];
    int index = 0;

    for( Pipe pipe : pipes )
      pipeNames[ index++ ] = pipe.getName();

    String joined = Util.join( pipeNames, "+" );

    return joined.length() > MAX_GENERATED_NAME_LENGTH ? joined.substring( 0, MAX_GENERATED_NAME_LENGTH ) : joined;
    }
  }