/*
 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.util;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.Manifest;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.Scope;
import cascading.pipe.Group;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.util.LogUtil;
import cascading.util.Util;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.util.Util.invokeInstanceMethod;

/**
 * Utility methods for working with Hadoop {@link Configuration} and {@link JobConf} instances.
 */
public class HadoopUtil
  {
  public static final String CASCADING_FLOW_EXECUTING = "cascading.flow.executing";

  private static final Logger LOG = LoggerFactory.getLogger( HadoopUtil.class );
  private static final String ENCODING = "US-ASCII";
  private static final Class<?> DEFAULT_OBJECT_SERIALIZER = JavaObjectSerializer.class;

  private static PlatformInfo platformInfo;

  public static void setIsInflow( Configuration conf )
    {
    conf.setBoolean( CASCADING_FLOW_EXECUTING, true );
    }

  public static boolean isInflow( Configuration conf )
    {
    return conf.getBoolean( CASCADING_FLOW_EXECUTING, false );
    }
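  /*
   * Usage sketch (illustrative, not part of the original file): the client side
   * marks the Configuration before submission, and task-side code can branch on
   * the flag; 'conf' is assumed to be the job Configuration in scope.
   *
   *   HadoopUtil.setIsInflow( conf );                  // client side, before submit
   *   boolean executing = HadoopUtil.isInflow( conf ); // true inside a running flow
   */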
  public static void initLog4j( JobConf configuration )
    {
    initLog4j( (Configuration) configuration );
    }

  public static void initLog4j( Configuration configuration )
    {
    String values = configuration.get( "log4j.logger", null );

    if( values == null || values.length() == 0 )
      return;

    if( !Util.hasClass( "org.apache.log4j.Logger" ) )
      {
      LOG.info( "org.apache.log4j.Logger is not in the current CLASSPATH, not setting log4j.logger properties" );
      return;
      }

    String[] elements = values.split( "," );

    for( String element : elements )
      LogUtil.setLog4jLevel( element.split( "=" ) );
    }

  // only place JobConf should ever be returned
  public static JobConf asJobConfInstance( Configuration configuration )
    {
    if( configuration instanceof JobConf )
      return (JobConf) configuration;

    return new JobConf( configuration );
    }

  public static <C> C copyJobConf( C parentJobConf )
    {
    return copyConfiguration( parentJobConf );
    }

  public static JobConf copyJobConf( JobConf parentJobConf )
    {
    if( parentJobConf == null )
      throw new IllegalArgumentException( "parent may not be null" );

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    final Configuration configurationCopy = new Configuration( parentJobConf );
    final JobConf jobConf = new JobConf( configurationCopy );

    jobConf.getCredentials().addAll( parentJobConf.getCredentials() );

    return jobConf;
    }
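  /*
   * Why the copy matters (sketch, assuming two steps derived from one parent):
   * copies taken via new JobConf( parent ) would share the parent's Credentials
   * object, so a mutation during one submission could leak into the other, while
   * copyJobConf gives each derived JobConf its own Credentials instance:
   *
   *   JobConf stepOne = HadoopUtil.copyJobConf( parent );
   *   JobConf stepTwo = HadoopUtil.copyJobConf( parent );
   *   // stepOne.getCredentials() and stepTwo.getCredentials() are distinct objects
   */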
  public static JobConf createJobConf( Map<Object, Object> properties )
    {
    return createJobConf( properties, null );
    }

  public static JobConf createJobConf( Map<Object, Object> properties, JobConf defaultJobconf )
    {
    JobConf jobConf = defaultJobconf == null ? new JobConf() : copyJobConf( defaultJobconf );

    if( properties == null )
      return jobConf;

    return copyConfiguration( properties, jobConf );
    }

  public static <C> C copyConfiguration( C parent )
    {
    if( parent == null )
      throw new IllegalArgumentException( "parent may not be null" );

    if( !( parent instanceof Configuration ) )
      throw new IllegalArgumentException( "parent must be of type Configuration" );

    Configuration conf = (Configuration) parent;

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    Configuration configurationCopy = new Configuration( conf );

    Configuration copiedConf = callCopyConstructor( parent.getClass(), configurationCopy );

    if( Util.hasInstanceMethod( parent, "getCredentials", null ) )
      {
      Object result = invokeInstanceMethod( parent, "getCredentials", null, null );
      Object credentials = invokeInstanceMethod( copiedConf, "getCredentials", null, null );

      invokeInstanceMethod( credentials, "addAll", new Object[]{result}, new Class[]{credentials.getClass()} );
      }

    return (C) copiedConf;
    }

  protected static <C extends Configuration> C callCopyConstructor( Class type, Configuration parent )
    {
    try
      {
      Constructor<C> constructor = type.getConstructor( parent.getClass() );

      return constructor.newInstance( parent );
      }
    catch( NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException exception )
      {
      throw new CascadingException( "unable to create copy of: " + type, exception );
      }
    }

  public static <C extends Configuration> C copyConfiguration( Map<Object, Object> srcProperties, C dstConfiguration )
    {
    Set<Object> keys = new HashSet<Object>( srcProperties.keySet() );

    // keys will only be grabbed if both key/value are String, so keep the original keys
    if( srcProperties instanceof Properties )
      keys.addAll( ( (Properties) srcProperties ).stringPropertyNames() );

    for( Object key : keys )
      {
      Object value = srcProperties.get( key );

      if( value == null && srcProperties instanceof Properties && key instanceof String )
        value = ( (Properties) srcProperties ).getProperty( (String) key );

      if( value == null ) // don't stuff null values
        continue;

      // don't let these objects pass, even though toString is called below
      if( value instanceof Class || value instanceof JobConf )
        continue;

      dstConfiguration.set( key.toString(), value.toString() );
      }

    return dstConfiguration;
    }
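  /*
   * Usage sketch (illustrative): seeding a JobConf from java.util.Properties.
   * Values are copied via toString; Class and JobConf values are deliberately
   * skipped by copyConfiguration above.
   *
   *   Properties props = new Properties();
   *   props.setProperty( "mapreduce.job.name", "example" ); // example value only
   *   JobConf jobConf = HadoopUtil.createJobConf( props, null );
   */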
  public static Map<Object, Object> createProperties( Configuration jobConf )
    {
    Map<Object, Object> properties = new HashMap<Object, Object>();

    if( jobConf == null )
      return properties;

    for( Map.Entry<String, String> entry : jobConf )
      properties.put( entry.getKey(), entry.getValue() );

    return properties;
    }

  public static Thread getHDFSShutdownHook()
    {
    Exception caughtException;

    try
      {
      // we must init the FS so the finalizer is registered
      FileSystem.getLocal( new JobConf() );

      Field field = FileSystem.class.getDeclaredField( "clientFinalizer" );
      field.setAccessible( true );

      Thread finalizer = (Thread) field.get( null );

      if( finalizer != null )
        Runtime.getRuntime().removeShutdownHook( finalizer );

      return finalizer;
      }
    catch( NoSuchFieldException | IllegalAccessException | IOException exception )
      {
      caughtException = exception;
      }

    LOG.debug( "unable to find and remove client hdfs shutdown hook, received exception: {}", caughtException.getClass().getName() );

    return null;
    }

  public static String encodeBytes( byte[] bytes )
    {
    try
      {
      return new String( Base64.encodeBase64( bytes ), ENCODING );
      }
    catch( UnsupportedEncodingException exception )
      {
      throw new RuntimeException( exception );
      }
    }

  public static byte[] decodeBytes( String string )
    {
    try
      {
      byte[] bytes = string.getBytes( ENCODING );
      return Base64.decodeBase64( bytes );
      }
    catch( UnsupportedEncodingException exception )
      {
      throw new RuntimeException( exception );
      }
    }

  public static <T> ObjectSerializer instantiateSerializer( Configuration conf, Class<T> type ) throws ClassNotFoundException
    {
    Class<ObjectSerializer> flowSerializerClass;

    String serializerClassName = conf.get( ObjectSerializer.OBJECT_SERIALIZER_PROPERTY );

    if( serializerClassName == null || serializerClassName.length() == 0 )
      flowSerializerClass = (Class<ObjectSerializer>) DEFAULT_OBJECT_SERIALIZER;
    else
      flowSerializerClass = (Class<ObjectSerializer>) Class.forName( serializerClassName );

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = flowSerializerClass.newInstance();

      if( objectSerializer instanceof Configurable )
        ( (Configurable) objectSerializer ).setConf( conf );
      }
    catch( Exception exception )
      {
      throw new IllegalArgumentException( "Unable to instantiate serializer \""
        + flowSerializerClass.getName()
        + "\" for class: "
        + type.getName(), exception );
      }

    if( !objectSerializer.accepts( type ) )
      throw new IllegalArgumentException( serializerClassName + " won't accept objects of class " + type.toString() );

    return objectSerializer;
    }
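  /*
   * Selection sketch (illustrative; the implementation class named below is
   * hypothetical): instantiateSerializer falls back to JavaObjectSerializer
   * unless this property names an alternative ObjectSerializer, and a
   * Configurable implementation also receives the Configuration before use.
   *
   *   conf.set( ObjectSerializer.OBJECT_SERIALIZER_PROPERTY, "com.example.KryoObjectSerializer" );
   */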
  public static <T> String serializeBase64( T object, Configuration conf ) throws IOException
    {
    return serializeBase64( object, conf, true );
    }

  public static <T> String serializeBase64( T object, Configuration conf, boolean compress ) throws IOException
    {
    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, object.getClass() );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return encodeBytes( objectSerializer.serialize( object, compress ) );
    }

  /**
   * This method deserializes the Base64 encoded String into an Object instance.
   *
   * @param string the Base64 encoded String to deserialize
   * @param conf   the current Configuration
   * @param type   the expected type of the result
   * @return the deserialized Object instance
   */
  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type ) throws IOException
    {
    return deserializeBase64( string, conf, type, true );
    }

  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type, boolean decompress ) throws IOException
    {
    if( string == null || string.length() == 0 )
      return null;

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, type );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return objectSerializer.deserialize( decodeBytes( string ), type, decompress );
    }

  public static Class findMainClass( Class defaultType )
    {
    return Util.findMainClass( defaultType, "org.apache.hadoop" );
    }

  public static Map<String, String> getConfig( Configuration defaultConf, Configuration updatedConf )
    {
    Map<String, String> configs = new HashMap<String, String>();

    for( Map.Entry<String, String> entry : updatedConf )
      configs.put( entry.getKey(), entry.getValue() );

    for( Map.Entry<String, String> entry : defaultConf )
      {
      if( entry.getValue() == null )
        continue;

      String updatedValue = configs.get( entry.getKey() );

      // if the values are the same, purge the entry from the map to save space
      if( updatedValue != null && updatedValue.equals( entry.getValue() ) )
        configs.remove( entry.getKey() );
      }

    // always purge the working directory properties from the diff
    configs.remove( "mapred.working.dir" );
    configs.remove( "mapreduce.job.working.dir" ); // hadoop2

    return configs;
    }

  public static JobConf[] getJobConfs( Configuration job, List<Map<String, String>> configs )
    {
    JobConf[] jobConfs = new JobConf[ configs.size() ];

    for( int i = 0; i < jobConfs.length; i++ )
      jobConfs[ i ] = (JobConf) mergeConf( job, configs.get( i ), false );

    return jobConfs;
    }

  public static <J extends Configuration> J mergeConf( J job, Map<String, String> config, boolean directly )
    {
    Configuration currentConf = directly ? job : ( job instanceof JobConf ? copyJobConf( (JobConf) job ) : new Configuration( job ) );

    for( String key : config.keySet() )
      {
      LOG.debug( "merging key: {} value: {}", key, config.get( key ) );

      currentConf.set( key, config.get( key ) );
      }

    return (J) currentConf;
    }
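  /*
   * Round-trip sketch (illustrative, assuming defaultConf and updatedConf are
   * existing Configurations): getConfig computes the minimal diff between the
   * two, and mergeConf re-applies it, so per-step overrides can be shipped
   * without the full key set.
   *
   *   Map<String, String> diff = HadoopUtil.getConfig( defaultConf, updatedConf );
   *   JobConf restored = HadoopUtil.mergeConf( new JobConf( defaultConf ), diff, true );
   */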
  public static Configuration removePropertiesFrom( Configuration jobConf, String... keys )
    {
    Map<Object, Object> properties = createProperties( jobConf );

    for( String key : keys )
      properties.remove( key );

    return copyConfiguration( properties, new JobConf() );
    }

  public static boolean removeStateFromDistCache( Configuration conf, String path ) throws IOException
    {
    return new Hfs( new TextLine(), path ).deleteResource( conf );
    }

  public static PlatformInfo getPlatformInfo()
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( JobConf.class, "org/apache/hadoop", "Hadoop" );

    return platformInfo;
    }

  public static PlatformInfo getPlatformInfo( Class type, String attributePath, String platformName )
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( type, attributePath, platformName );

    return platformInfo;
    }

  public static PlatformInfo createPlatformInfo( Class type, String attributePath, String platformName )
    {
    return getPlatformInfoInternal( type, attributePath, platformName );
    }

  private static PlatformInfo getPlatformInfoInternal( Class type, String attributePath, String platformName )
    {
    URL url = type.getResource( type.getSimpleName() + ".class" );

    if( url == null || !url.toString().startsWith( "jar" ) )
      return new PlatformInfo( platformName, null, null );

    String path = url.toString();
    path = path.substring( 0, path.lastIndexOf( "!" ) + 1 );

    String manifestPath = path + "/META-INF/MANIFEST.MF";
    String parsedVersion = Util.findVersion( path.substring( 0, path.length() - 1 ) );

    Manifest manifest;

    try
      {
      manifest = new Manifest( new URL( manifestPath ).openStream() );
      }
    catch( IOException exception )
      {
      LOG.warn( "unable to get manifest from {}: {}", manifestPath, exception.getMessage() );

      return new PlatformInfo( platformName, null, parsedVersion );
      }

    Attributes attributes = manifest.getAttributes( attributePath );

    if( attributes == null )
      attributes = manifest.getMainAttributes();

    if( attributes == null )
      {
      LOG.debug( "unable to get platform manifest attributes" );
      return new PlatformInfo( platformName, null, parsedVersion );
      }

    String vendor = attributes.getValue( "Implementation-Vendor" );
    String version = attributes.getValue( "Implementation-Version" );

    if( Util.isEmpty( version ) )
      version = parsedVersion;

    return new PlatformInfo( platformName, vendor, version );
    }
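  /*
   * Usage sketch (illustrative): the first call resolves vendor and version from
   * the manifest of the jar providing JobConf, falling back to a version parsed
   * from the jar path, and the result is cached for subsequent calls.
   *
   *   PlatformInfo info = HadoopUtil.getPlatformInfo();
   */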
  /**
   * Copies paths from one local path to a remote path. If syncTimes is true, the remote modification time is
   * changed to match the local 'from' path; the access time is left untouched.
   *
   * @param config      the current Configuration
   * @param commonPaths a map of local to remote Paths to synchronize
   * @param syncTimes   true if the remote modification times should be updated to match the local ones
   * @return a map of file-name to remote modification time for each file whose remote time differs from the local time
   */
  public static Map<String, Long> syncPaths( Configuration config, Map<Path, Path> commonPaths, boolean syncTimes )
    {
    if( commonPaths == null )
      return Collections.emptyMap();

    Map<String, Long> timestampMap = new HashMap<>();

    Map<Path, Path> copyPaths = getCopyPaths( config, commonPaths ); // tests remote file existence or if stale

    LocalFileSystem localFS = getLocalFS( config );
    FileSystem remoteFS = getDefaultFS( config );

    for( Map.Entry<Path, Path> entry : copyPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        LOG.info( "copying from: {}, to: {}", localPath, remotePath );
        remoteFS.copyFromLocalFile( localPath, remotePath );

        if( !syncTimes )
          {
          timestampMap.put( remotePath.getName(), remoteFS.getFileStatus( remotePath ).getModificationTime() );
          continue;
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to copy local: " + localPath + " to remote: " + remotePath, exception );
        }

      FileStatus localFileStatus = null;

      try
        {
        // sync the modified times so we can lazily upload jars to hdfs after the job is started
        // otherwise the modified time will be local to hdfs
        localFileStatus = localFS.getFileStatus( localPath );
        remoteFS.setTimes( remotePath, localFileStatus.getModificationTime(), -1 ); // don't set the access time
        }
      catch( IOException exception )
        {
        LOG.info( "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.", remotePath );

        if( localFileStatus != null )
          timestampMap.put( remotePath.getName(), localFileStatus.getModificationTime() );
        }
      }

    return timestampMap;
    }
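  /*
   * Staging sketch (illustrative; 'conf' and 'classpath' are assumed to be in
   * scope, and the staging root below is hypothetical): a typical classpath sync
   * partitions paths, intersects them, then copies and stamps whatever is missing
   * or stale, using the methods that follow.
   *
   *   Map<String, Path> localPaths = new HashMap<>();
   *   Map<String, Path> remotePaths = new HashMap<>();
   *   HadoopUtil.resolvePaths( conf, classpath, "/tmp/staging", null, localPaths, remotePaths );
   *   Map<Path, Path> common = HadoopUtil.getCommonPaths( localPaths, remotePaths );
   *   Map<String, Long> stamps = HadoopUtil.syncPaths( conf, common, true );
   */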
  public static Map<Path, Path> getCommonPaths( Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    Map<Path, Path> commonPaths = new HashMap<Path, Path>();

    for( Map.Entry<String, Path> entry : localPaths.entrySet() )
      {
      if( remotePaths.containsKey( entry.getKey() ) )
        commonPaths.put( entry.getValue(), remotePaths.get( entry.getKey() ) );
      }

    return commonPaths;
    }

  private static Map<Path, Path> getCopyPaths( Configuration config, Map<Path, Path> commonPaths )
    {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();

    FileSystem remoteFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    for( Map.Entry<Path, Path> entry : commonPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        boolean localExists = localFS.exists( localPath );
        boolean remoteExists = remoteFS.exists( remotePath );

        if( localExists && !remoteExists )
          {
          copyPaths.put( localPath, remotePath );
          }
        else if( localExists )
          {
          long localModTime = localFS.getFileStatus( localPath ).getModificationTime();
          long remoteModTime = remoteFS.getFileStatus( remotePath ).getModificationTime();

          if( localModTime > remoteModTime )
            copyPaths.put( localPath, remotePath );
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to get handle to underlying filesystem", exception );
        }
      }

    return copyPaths;
    }

  public static void resolvePaths( Configuration config, Collection<String> classpath, String remoteRoot, String resourceSubPath, Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    FileSystem defaultFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    Path remoteRootPath = new Path( remoteRoot == null ? "./.staging" : remoteRoot );

    if( resourceSubPath != null )
      remoteRootPath = new Path( remoteRootPath, resourceSubPath );

    remoteRootPath = defaultFS.makeQualified( remoteRootPath );

    boolean defaultIsLocal = defaultFS.equals( localFS );

    for( String stringPath : classpath )
      {
      Path path = new Path( stringPath );

      URI uri = path.toUri();

      if( uri.getScheme() == null && !defaultIsLocal ) // we want to sync
        {
        Path localPath = localFS.makeQualified( path );

        if( !exists( localFS, localPath ) )
          throw new FlowException( "path not found: " + localPath );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        remotePaths.put( name, defaultFS.makeQualified( new Path( remoteRootPath, path.getName() ) ) );
        }
      else if( localFS.equals( getFileSystem( config, path ) ) )
        {
        if( !exists( localFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path localPath = localFS.makeQualified( path );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        }
      else
        {
        if( !exists( defaultFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path defaultPath = defaultFS.makeQualified( path );

        String name = defaultPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        remotePaths.put( name, defaultPath );
        }
      }
    }
  private static boolean exists( FileSystem fileSystem, Path path )
    {
    try
      {
      return fileSystem.exists( path );
      }
    catch( IOException exception )
      {
      throw new FlowException( "could not test file exists: " + path, exception );
      }
    }

  private static FileSystem getFileSystem( Configuration config, Path path )
    {
    try
      {
      return path.getFileSystem( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  public static LocalFileSystem getLocalFS( Configuration config )
    {
    try
      {
      return FileSystem.getLocal( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  public static FileSystem getDefaultFS( Configuration config )
    {
    try
      {
      return FileSystem.get( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  public static boolean isLocal( Configuration conf )
    {
    // hadoop 1.0 and 2.0 use different properties to define local mode: we check the new YARN
    // property first
    String frameworkName = conf.get( "mapreduce.framework.name" );

    // we are running on hadoop 2.0 (YARN)
    if( frameworkName != null )
      return frameworkName.equals( "local" );

    // for Tez
    String tezLocal = conf.get( "tez.local.mode" );

    if( tezLocal != null )
      return tezLocal.equals( "true" );

    // hadoop 1.0: use the old property to determine local mode
    String hadoop1 = conf.get( "mapred.job.tracker" );

    if( hadoop1 == null )
      {
      LOG.warn( "could not successfully test if Hadoop based platform is in standalone/local mode, no valid properties set, returning false - tests for: mapreduce.framework.name, tez.local.mode, and mapred.job.tracker" );
      return false;
      }

    return hadoop1.equals( "local" );
    }

  public static boolean isYARN( Configuration conf )
    {
    return conf.get( "mapreduce.framework.name" ) != null;
    }

  public static void setLocal( Configuration conf )
    {
    // hadoop 1.0
    conf.set( "mapred.job.tracker", "local" );

    // yarn
    conf.set( "mapreduce.framework.name", "local" );

    // tez
    conf.set( "tez.local.mode", "true" );
    conf.set( "tez.runtime.optimize.local.fetch", "true" );
    }
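  /*
   * Test-setup sketch (illustrative): forcing standalone mode regardless of
   * which platform property the runtime later consults.
   *
   *   JobConf conf = new JobConf();
   *   HadoopUtil.setLocal( conf );
   *   assert HadoopUtil.isLocal( conf );
   */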
  private static boolean interfaceAssignableFromClassName( Class<?> xface, String className )
    {
    if( ( className == null ) || ( xface == null ) )
      return false;

    try
      {
      Class<?> klass = Class.forName( className );

      return xface.isAssignableFrom( klass );
      }
    catch( ClassNotFoundException cnfe )
      {
      return false; // let downstream figure it out
      }
    }

  public static boolean setNewApi( Configuration conf, String className )
    {
    if( className == null ) // silently return and let the error be caught downstream
      return false;

    boolean isStable = className.startsWith( "org.apache.hadoop.mapred." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapred.InputFormat.class, className );

    boolean isNew = className.startsWith( "org.apache.hadoop.mapreduce." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapreduce.InputFormat.class, className );

    if( isStable )
      conf.setBoolean( "mapred.mapper.new-api", false );
    else if( isNew )
      conf.setBoolean( "mapred.mapper.new-api", true );
    else
      throw new IllegalStateException( "cannot determine if class denotes stable or new api, please set 'mapred.mapper.new-api' to the appropriate value" );

    return true;
    }

  public static void addInputPaths( Configuration conf, Iterable<Path> paths )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    String dirs = conf.get( "mapred.input.dir" );
    StringBuilder buffer = new StringBuilder( dirs == null ? "" : dirs );

    for( Path path : paths )
      {
      if( !path.isAbsolute() )
        path = new Path( workingDirectory, path );

      String dirStr = StringUtils.escapeString( path.toString() );

      if( buffer.length() != 0 )
        buffer.append( ',' );

      buffer.append( dirStr );
      }

    conf.set( "mapred.input.dir", buffer.toString() );
    }

  public static void addInputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    String dirStr = StringUtils.escapeString( path.toString() );
    String dirs = conf.get( "mapred.input.dir" );
    conf.set( "mapred.input.dir", dirs == null ? dirStr : dirs + StringUtils.COMMA_STR + dirStr );
    }

  public static void setOutputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    conf.set( "mapred.output.dir", path.toString() );
    }

  private static Path getWorkingDirectory( Configuration conf )
    {
    String name = conf.get( "mapred.working.dir" );

    if( name != null )
      return new Path( name );

    try
      {
      Path dir = FileSystem.get( conf ).getWorkingDirectory();
      conf.set( "mapred.working.dir", dir.toString() );
      return dir;
      }
    catch( IOException exception )
      {
      throw new RuntimeException( exception );
      }
    }

  public static Path getOutputPath( Configuration conf )
    {
    String name = conf.get( "mapred.output.dir" );
    return name == null ? null : new Path( name );
    }

  public static String pack( Object object, Configuration conf )
    {
    if( object == null )
      return "";

    try
      {
      return serializeBase64( object, conf, true );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to pack object: " + object.getClass().getCanonicalName(), exception );
      }
    }

  public static void addFields( Configuration conf, String property, Map<Integer, Fields> fields )
    {
    if( fields == null || fields.isEmpty() )
      return;

    Map<String, Fields> toPack = new HashMap<>();

    for( Map.Entry<Integer, Fields> entry : fields.entrySet() )
      toPack.put( entry.getKey().toString(), entry.getValue() );

    conf.set( property, pack( toPack, conf ) );
    }

  public static Map<Integer, Fields> getFields( Configuration conf, String property ) throws IOException
    {
    String value = conf.getRaw( property );

    if( value == null || value.isEmpty() )
      return Collections.emptyMap();

    Map<String, Fields> map = deserializeBase64( value, conf, Map.class, true );
    Map<Integer, Fields> result = new HashMap<>();

    for( Map.Entry<String, Fields> entry : map.entrySet() )
      result.put( Integer.parseInt( entry.getKey() ), entry.getValue() );

    return result;
    }
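  /*
   * Round-trip sketch (illustrative; the property key below is hypothetical):
   * fields keyed by ordinal survive a pack into the Configuration and a later
   * unpack on the task side, where getFields may throw IOException.
   *
   *   Map<Integer, Fields> byOrdinal = Collections.singletonMap( 0, new Fields( "line" ) );
   *   HadoopUtil.addFields( conf, "cascading.example.fields", byOrdinal );
   *   Map<Integer, Fields> restored = HadoopUtil.getFields( conf, "cascading.example.fields" );
   */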
  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, BaseFlowStep flowStep, Group group )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, pack( fields, conf ) );
      return;
      }

    // use resolved fields if there are no comparators
    Set<Scope> previousScopes = flowStep.getPreviousScopes( group );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows Fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );
    }

  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, Fields resolvedFields )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    while( fieldsIterator.hasNext() )
      {
      Fields fields = fieldsIterator.next();

      if( fields.hasComparators() )
        {
        conf.set( property, pack( fields, conf ) );
        return;
        }
      }

    if( resolvedFields.size() != 0 ) // allows Fields.UNKNOWN to be used
      conf.setInt( property + ".size", resolvedFields.size() );
    }
  }