/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.beans.ConstructorProperties;
import java.util.Map;
import java.util.Properties;

import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import org.apache.hadoop.mapred.JobConf;

/**
 * Class MapReduceFlow is a {@link cascading.flow.hadoop.HadoopFlow} subclass that supports custom MapReduce jobs
 * pre-configured via the {@link JobConf} object.
 * <p>
 * Use this class to allow custom MapReduce jobs to participate in the {@link cascading.cascade.Cascade} scheduler. If
 * other Flow instances in the Cascade share resources with this Flow instance, all participants will be scheduled
 * according to their dependencies (topologically).
 * <p>
 * Set the parameter {@code deleteSinkOnInit} to {@code true} if the output path in the {@code jobConf} should be
 * deleted before executing the MapReduce job.
 * <p>
 * MapReduceFlow assumes the underlying input and output paths are compatible with the {@link Hfs} Tap.
 * <p>
 * If the configured JobConf instance uses some other identifier instead of Hadoop FS paths, override the
 * {@link #createSources(org.apache.hadoop.mapred.JobConf)}, {@link #createSinks(org.apache.hadoop.mapred.JobConf)}, and
 * {@link #createTraps(org.apache.hadoop.mapred.JobConf)} methods to resolve the configured identifiers into
 * usable {@link Tap} instances. By default, createTraps returns an empty collection and can typically be left as-is.
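 * <p>
 * For example, a minimal sketch of a subclass that resolves a custom identifier into a source Tap. The
 * {@code CustomTap} type and the {@code custom.input.id} property are hypothetical, for illustration only:
 * <pre>{@code
 * public class CustomMapReduceFlow extends MapReduceFlow
 *   {
 *   public CustomMapReduceFlow( JobConf jobConf )
 *     {
 *     super( jobConf.getJobName(), jobConf );
 *     }
 *
 *   protected Map<String, Tap> createSources( JobConf jobConf )
 *     {
 *     Map<String, Tap> sources = new HashMap<>();
 *     String identifier = jobConf.get( "custom.input.id" ); // hypothetical identifier property
 *
 *     sources.put( identifier, new CustomTap( identifier ) ); // resolve the identifier into a Tap
 *
 *     return sources;
 *     }
 *   }
 * }</pre>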
 * <p>
 * MapReduceFlow supports both org.apache.hadoop.mapred.* and org.apache.hadoop.mapreduce.* API Jobs.
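 * <p>
 * A minimal usage sketch, assuming {@code jobConf} is fully configured and {@code otherFlow} is another
 * Flow instance sharing resources with this one (both are illustrative):
 * <pre>{@code
 * JobConf jobConf = new JobConf();
 * jobConf.setJobName( "custom mr job" );
 * // ... set mapper, reducer, input and output paths ...
 *
 * // true: delete the configured output path before running
 * Flow flow = new MapReduceFlow( jobConf, true );
 *
 * Cascade cascade = new CascadeConnector().connect( flow, otherFlow );
 * cascade.complete(); // executes all flows in dependency order
 * }</pre>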
 */
public class MapReduceFlow extends BaseMapReduceFlow
  {
  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"jobConf"})
  public MapReduceFlow( JobConf jobConf )
    {
    this( jobConf.getJobName(), jobConf, false );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( jobConf.getJobName(), jobConf, deleteSinkOnInit );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param name    of type String
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"name", "jobConf"})
  public MapReduceFlow( String name, JobConf jobConf )
    {
    this( name, jobConf, false );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( new Properties(), name, jobConf, null, deleteSinkOnInit, true );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( properties, name, jobConf, null, deleteSinkOnInit, true );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param flowDescriptor   of type Map
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "flowDescriptor", "deleteSinkOnInit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, Map<String, String> flowDescriptor, boolean deleteSinkOnInit )
    {
    this( properties, name, jobConf, flowDescriptor, deleteSinkOnInit, true );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param flowDescriptor   of type Map
   * @param deleteSinkOnInit of type boolean
   * @param stopJobsOnExit   of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "flowDescriptor", "deleteSinkOnInit", "stopJobsOnExit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, Map<String, String> flowDescriptor, boolean deleteSinkOnInit, boolean stopJobsOnExit )
    {
    super( HadoopUtil.getPlatformInfo( JobConf.class, "org/apache/hadoop", "Hadoop MR" ), properties, jobConf, name, flowDescriptor, deleteSinkOnInit );
    this.stopJobsOnExit = stopJobsOnExit;

    initializeFrom( jobConf ); // push off initialization allowing for overrides
    }

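  /**
   * Method initializeFrom resolves the given {@link JobConf} into sources, sinks, traps, and a step graph,
   * then performs the same initialization sequence as {@code BaseFlow#initialize()}. It is invoked from the
   * constructor, after construction of the superclass, so that subclasses may override the create* factory
   * methods it calls.
   *
   * @param jobConf of type JobConf
   */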
  protected void initializeFrom( JobConf jobConf )
    {
    setSources( createSources( jobConf ) );
    setSinks( createSinks( jobConf ) );
    setTraps( createTraps( jobConf ) );
    setFlowStepGraph( makeStepGraph( jobConf ) );

    // this mirrors BaseFlow#initialize()

    initSteps();

    this.flowStats = createPrepareFlowStats(); // must be last

    initializeNewJobsMap();

    initializeChildStats();
    }
  }