/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.beans.ConstructorProperties;
import java.util.Map;
import java.util.Properties;

import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import org.apache.hadoop.mapred.JobConf;

/**
 * A {@link cascading.flow.hadoop.HadoopFlow} subclass that wraps a custom, pre-configured
 * MapReduce job (carried in a {@link JobConf}) so it can participate in a
 * {@link cascading.cascade.Cascade}.
 * <p>
 * When other Flow instances in the Cascade share resources with this Flow, the Cascade
 * scheduler orders all participants topologically by their dependencies.
 * <p>
 * Pass {@code deleteSinkOnInit} as {@code true} if the output path configured in the
 * {@code jobConf} should be removed before the MapReduce job executes.
 * <p>
 * Input and output paths are assumed to be compatible with the {@link Hfs} Tap. If the
 * configured JobConf uses identifiers other than Hadoop FS paths, override
 * {@link #createSources(org.apache.hadoop.mapred.JobConf)},
 * {@link #createSinks(org.apache.hadoop.mapred.JobConf)}, and
 * {@link #createTraps(org.apache.hadoop.mapred.JobConf)} to resolve those identifiers
 * into usable {@link Tap} instances. {@code createTraps} returns an empty collection by
 * default and should usually be left alone.
 * <p>
 * Both org.apache.hadoop.mapred.* and org.apache.hadoop.mapreduce.* API Jobs are supported.
 */
public class MapReduceFlow extends BaseMapReduceFlow
  {
  /**
   * Creates a MapReduceFlow named after the job, without deleting the sink on init.
   *
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"jobConf"})
  public MapReduceFlow( JobConf jobConf )
    {
    this( jobConf, false );
    }

  /**
   * Creates a MapReduceFlow named after the job.
   *
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( jobConf.getJobName(), jobConf, deleteSinkOnInit );
    }

  /**
   * Creates a named MapReduceFlow, without deleting the sink on init.
   *
   * @param name    of type String
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"name", "jobConf"})
  public MapReduceFlow( String name, JobConf jobConf )
    {
    this( name, jobConf, false );
    }

  /**
   * Creates a named MapReduceFlow.
   *
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( new Properties(), name, jobConf, null, deleteSinkOnInit, true );
    }

  /**
   * Creates a named MapReduceFlow with the given configuration defaults.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( properties, name, jobConf, null, deleteSinkOnInit, true );
    }

  /**
   * Creates a named MapReduceFlow with a flow descriptor.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param flowDescriptor   of type Map
   * @param deleteSinkOnInit of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "flowDescriptor", "deleteSinkOnInit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, Map<String, String> flowDescriptor, boolean deleteSinkOnInit )
    {
    this( properties, name, jobConf, flowDescriptor, deleteSinkOnInit, true );
    }

  /**
   * Canonical constructor — every other overload delegates here.
   *
   * @param properties       of type Properties
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param flowDescriptor   of type Map
   * @param deleteSinkOnInit of type boolean
   * @param stopJobsOnExit   of type boolean
   */
  @ConstructorProperties({"properties", "name", "jobConf", "flowDescriptor", "deleteSinkOnInit", "stopJobsOnExit"})
  public MapReduceFlow( Properties properties, String name, JobConf jobConf, Map<String, String> flowDescriptor, boolean deleteSinkOnInit, boolean stopJobsOnExit )
    {
    super( HadoopUtil.getPlatformInfo( JobConf.class, "org/apache/hadoop", "Hadoop MR" ), properties, jobConf, name, flowDescriptor, deleteSinkOnInit );
    this.stopJobsOnExit = stopJobsOnExit;

    // initialization is deferred to an overridable hook so subclasses can
    // customize how taps are resolved from the JobConf
    initializeFrom( jobConf );
    }

  /**
   * Resolves sources, sinks, and traps from the given JobConf, then performs the
   * remaining flow setup. Mirrors the sequence in BaseFlow#initialize().
   *
   * @param jobConf of type JobConf
   */
  protected void initializeFrom( JobConf jobConf )
    {
    setSources( createSources( jobConf ) );
    setSinks( createSinks( jobConf ) );
    setTraps( createTraps( jobConf ) );
    setFlowStepGraph( makeStepGraph( jobConf ) );

    initSteps();

    this.flowStats = createPrepareFlowStats(); // must be last

    initializeNewJobsMap();

    initializeChildStats();
    }
  }