/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.planner;

import java.net.URI;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.flow.FlowConnector;
import cascading.flow.FlowConnectorProps;
import cascading.flow.FlowDef;
import cascading.flow.FlowStep;
import cascading.flow.hadoop.HadoopFlow;
import cascading.flow.hadoop.HadoopFlowStep;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStepFactory;
import cascading.flow.planner.FlowPlanner;
import cascading.flow.planner.PlannerInfo;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.graph.ElementGraph;
import cascading.flow.planner.process.FlowNodeGraph;
import cascading.flow.planner.process.FlowStepFactory;
import cascading.flow.planner.rule.RuleRegistry;
import cascading.flow.planner.rule.transformer.IntermediateTapElementFactory;
import cascading.property.AppProps;
import cascading.property.PropertyUtil;
import cascading.tap.Tap;
import cascading.tap.hadoop.DistCacheTap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.util.TempHfs;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class HadoopPlanner is the core Hadoop MapReduce planner used by default through a {@link cascading.flow.FlowConnector}
 * sub-class.
 * <p>
 * Notes:
 * <p>
 * <strong>Custom JobConf properties</strong><br>
 * A custom JobConf instance can be passed to this planner by calling {@link #copyJobConf(java.util.Map, org.apache.hadoop.mapred.JobConf)}
 * on a map properties object before constructing a new {@link cascading.flow.FlowConnector} sub-class.
 * <p>
 * A better practice would be to set Hadoop properties directly on the map properties object handed to the FlowConnector.
 * All values in the map will be passed to a new default JobConf instance to be used as defaults for all resulting
 * Flow instances.
 * <p>
 * For example, {@code properties.put( "mapred.child.java.opts", "-Xmx512m" );} would instruct Hadoop
 * to spawn all child JVMs with a 512MB heap.
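 * <p>
 * As a rough sketch of that practice ({@code Main} and {@code flowDef} below are placeholders for an application
 * class and a previously assembled {@link cascading.flow.FlowDef}, and the heap setting is illustrative):
 * <pre>{@code
 * Properties properties = new Properties();
 *
 * // any Hadoop property set here becomes a default on the planner's JobConf
 * properties.put( "mapred.child.java.opts", "-Xmx512m" );
 *
 * // identify the application jar by a class it contains
 * AppProps.setApplicationJarClass( properties, Main.class );
 *
 * FlowConnector flowConnector = new HadoopFlowConnector( properties );
 * Flow flow = flowConnector.connect( flowDef );
 * }</pre>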
 */
public class HadoopPlanner extends FlowPlanner<HadoopFlow, JobConf>
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( HadoopPlanner.class );

  public static final String PLATFORM_NAME = "hadoop";

  /** Field defaultJobConf */
  private JobConf defaultJobConf;
  /** Field intermediateSchemeClass */
  private Class intermediateSchemeClass;

  /**
   * Method copyJobConf adds the given JobConf values to the given properties object. Use this method to pass
   * custom default Hadoop JobConf properties to Hadoop.
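   * <p>
   * A minimal sketch (the queue name shown is illustrative):
   * <pre>{@code
   * JobConf jobConf = new JobConf();
   * jobConf.set( "mapreduce.job.queuename", "analytics" );
   *
   * Properties properties = new Properties();
   * HadoopPlanner.copyJobConf( properties, jobConf ); // every entry in the JobConf becomes a default property
   * }</pre>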
   *
   * @param properties of type Map
   * @param jobConf    of type JobConf
   */
  public static void copyJobConf( Map<Object, Object> properties, JobConf jobConf )
    {
    for( Map.Entry<String, String> entry : jobConf )
      properties.put( entry.getKey(), entry.getValue() );
    }

  /**
   * Method createJobConf returns a new JobConf instance using the values in the given properties argument.
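   * <p>
   * For instance (a minimal sketch, reusing the heap setting from the class notes above):
   * <pre>{@code
   * Properties properties = new Properties();
   * properties.put( "mapred.child.java.opts", "-Xmx512m" );
   *
   * JobConf jobConf = HadoopPlanner.createJobConf( properties );
   * }</pre>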
   *
   * @param properties of type Map
   * @return a JobConf instance
   */
  public static JobConf createJobConf( Map<Object, Object> properties )
    {
    JobConf conf = new JobConf();

    copyProperties( conf, properties );

    return conf;
    }

  /**
   * Method copyProperties adds the given Map values to the given JobConf object.
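   * <p>
   * A minimal sketch (the property shown is illustrative):
   * <pre>{@code
   * Properties properties = new Properties();
   * properties.put( "mapreduce.job.queuename", "analytics" );
   *
   * JobConf jobConf = new JobConf();
   * HadoopPlanner.copyProperties( jobConf, properties ); // null values in a plain Map are skipped
   * }</pre>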
   *
   * @param jobConf    of type JobConf
   * @param properties of type Map
   */
  public static void copyProperties( JobConf jobConf, Map<Object, Object> properties )
    {
    if( properties instanceof Properties )
      {
      Properties props = (Properties) properties;
      Set<String> keys = props.stringPropertyNames();

      for( String key : keys )
        jobConf.set( key, props.getProperty( key ) );
      }
    else
      {
      for( Map.Entry<Object, Object> entry : properties.entrySet() )
        {
        if( entry.getValue() != null )
          jobConf.set( entry.getKey().toString(), entry.getValue().toString() );
        }
      }
    }

  @Override
  public PlannerInfo getPlannerInfo( String registryName )
    {
    return new PlannerInfo( getClass().getSimpleName(), PLATFORM_NAME, registryName );
    }

  @Override
  public JobConf getDefaultConfig()
    {
    return defaultJobConf;
    }

  @Override
  public PlatformInfo getPlatformInfo()
    {
    return HadoopUtil.getPlatformInfo( JobConf.class, "org/apache/hadoop", "Hadoop MR" );
    }

  @Override
  public void initialize( FlowConnector flowConnector, Map<Object, Object> properties )
    {
    super.initialize( flowConnector, properties );

    defaultJobConf = HadoopUtil.createJobConf( properties, createJobConf( properties ) );
    checkPlatform( defaultJobConf );
    intermediateSchemeClass = flowConnector.getIntermediateSchemeClass( properties );

    Class type = AppProps.getApplicationJarClass( properties );
    if( defaultJobConf.getJar() == null && type != null )
      defaultJobConf.setJarByClass( type );

    String path = AppProps.getApplicationJarPath( properties );
    if( defaultJobConf.getJar() == null && path != null )
      defaultJobConf.setJar( path );

    if( defaultJobConf.getJar() == null )
      defaultJobConf.setJarByClass( HadoopUtil.findMainClass( HadoopPlanner.class ) );

    AppProps.setApplicationJarPath( properties, defaultJobConf.getJar() );

    LOG.info( "using application jar: {}", defaultJobConf.getJar() );
    }

  @Override
  public void configRuleRegistryDefaults( RuleRegistry ruleRegistry )
    {
    super.configRuleRegistryDefaults( ruleRegistry );

    ruleRegistry.addDefaultElementFactory( IntermediateTapElementFactory.TEMP_TAP, new TempTapElementFactory() );

    if( PropertyUtil.getBooleanProperty( getDefaultProperties(), FlowConnectorProps.ENABLE_DECORATE_ACCUMULATED_TAP, true ) )
      ruleRegistry.addDefaultElementFactory( IntermediateTapElementFactory.ACCUMULATED_TAP, new TempTapElementFactory( DistCacheTap.class.getName() ) );
    }

  protected void checkPlatform( Configuration conf )
    {
    if( HadoopUtil.isYARN( conf ) )
      LOG.warn( "running YARN based flows on Hadoop 1.x may cause problems, please use the 'cascading-hadoop2-mr1' dependencies" );
    }

  @Override
  protected HadoopFlow createFlow( FlowDef flowDef )
    {
    return new HadoopFlow( getPlatformInfo(), getDefaultProperties(), getDefaultConfig(), flowDef );
    }

  @Override
  public FlowStepFactory<JobConf> getFlowStepFactory()
    {
    return new BaseFlowStepFactory<JobConf>( getFlowNodeFactory() )
      {
      @Override
      public FlowStep<JobConf> createFlowStep( ElementGraph stepElementGraph, FlowNodeGraph flowNodeGraph )
        {
        return new HadoopFlowStep( stepElementGraph, flowNodeGraph );
        }
      };
    }

  public URI getDefaultURIScheme( Tap tap )
    {
    return ( (Hfs) tap ).getDefaultFileSystemURIScheme( defaultJobConf );
    }

  public URI getURIScheme( Tap tap )
    {
    return ( (Hfs) tap ).getURIScheme( defaultJobConf );
    }

  @Override
  protected Tap makeTempTap( String prefix, String name )
    {
    // must give Taps unique names
    return new TempHfs( defaultJobConf, Util.makePath( prefix, name ), intermediateSchemeClass, prefix == null );
    }
  }