/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.io;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.type.FileType;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class MultiInputSplit is used by {@link MultiInputFormat} to wrap a child {@link InputSplit}
 * along with the configuration properties of the source Tap it belongs to.
 */
public class MultiInputSplit implements InputSplit, JobConfigurable
  {
  /**
   * @deprecated see {@link FileType#CASCADING_SOURCE_PATH}.
   */
  @Deprecated
  public static final String CASCADING_SOURCE_PATH = FileType.CASCADING_SOURCE_PATH;

  private static final Logger LOG = LoggerFactory.getLogger( MultiInputSplit.class );

  /** Field jobConf */
  private transient JobConf jobConf;
  /** Field inputSplit */
  InputSplit inputSplit;
  /** Field config */
  Map<String, String> config;

  /**
   * Method getCurrentTapSourcePath finds and returns the current source Tap filename path, if any.
   * <p>
   * Use this method inside an Operation to find the current file being processed.
   *
   * @param jobConf of type JobConf
   * @return a String, or null if no source path has been set
   */
  public static String getCurrentTapSourcePath( JobConf jobConf )
    {
    return jobConf.get( FileType.CASCADING_SOURCE_PATH );
    }
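  /*
   * A minimal usage sketch from inside an Operation, assuming the Hadoop platform.
   * CurrentPathFunction is a hypothetical subclass, not part of this file; HadoopFlowProcess
   * is Cascading's Hadoop FlowProcess implementation:
   *
   *   public class CurrentPathFunction extends BaseOperation<Void> implements Function<Void>
   *     {
   *     @Override
   *     public void operate( FlowProcess flowProcess, FunctionCall<Void> functionCall )
   *       {
   *       JobConf conf = ( (HadoopFlowProcess) flowProcess ).getJobConf();
   *       String path = MultiInputSplit.getCurrentTapSourcePath( conf ); // null if unset
   *       ...
   *       }
   *     }
   */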
  /**
   * Constructor MultiInputSplit creates a new MultiInputSplit instance wrapping the given
   * child split and the configuration properties of its source Tap.
   *
   * @param inputSplit of type InputSplit
   * @param config     of type Map
   */
  public MultiInputSplit( InputSplit inputSplit, Map<String, String> config )
    {
    if( inputSplit == null )
      throw new IllegalArgumentException( "input split may not be null" );

    if( config == null )
      throw new IllegalArgumentException( "config may not be null" );

    this.inputSplit = inputSplit;
    this.config = config;
    }

  /**
   * This constructor is used internally by Hadoop. It is expected that {@link #configure(org.apache.hadoop.mapred.JobConf)}
   * and {@link #readFields(java.io.DataInput)} are called to properly initialize this instance.
   */
  public MultiInputSplit()
    {
    }

  public void configure( JobConf jobConf )
    {
    this.jobConf = jobConf;
    }

  public long getLength() throws IOException
    {
    return inputSplit.getLength();
    }

  public String[] getLocations() throws IOException
    {
    return inputSplit.getLocations();
    }

  public InputSplit getWrappedInputSplit()
    {
    return inputSplit;
    }

  public void write( DataOutput out ) throws IOException
    {
    // serialize the child split class name, the config diff as parallel key/value
    // arrays, then delegate to the child split to serialize itself
    out.writeUTF( inputSplit.getClass().getName() );

    String[] keys = config.keySet().toArray( new String[ config.size() ] );
    String[] values = new String[ keys.length ];

    for( int i = 0; i < keys.length; i++ )
      values[ i ] = config.get( keys[ i ] );

    WritableUtils.writeStringArray( out, keys );
    WritableUtils.writeStringArray( out, values );

    inputSplit.write( out );
    }

  public void readFields( DataInput in ) throws IOException
    {
    String splitType = in.readUTF();
    config = new HashMap<String, String>();

    String[] keys = WritableUtils.readStringArray( in );
    String[] values = WritableUtils.readStringArray( in );

    for( int i = 0; i < keys.length; i++ )
      config.put( keys[ i ], values[ i ] );

    if( LOG.isDebugEnabled() )
      {
      LOG.debug( "current split config diff:" );
      for( Map.Entry<String, String> entry : config.entrySet() )
        LOG.debug( "key: {}, value: {}", entry.getKey(), entry.getValue() );
      }

    JobConf currentConf = HadoopUtil.mergeConf( jobConf, config, false );

    try
      {
      inputSplit = (InputSplit) ReflectionUtils.newInstance( currentConf.getClassByName( splitType ), currentConf );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( "split class " + splitType + " not found", exception );
      }

    inputSplit.readFields( in );

    if( inputSplit instanceof FileSplit )
      {
      Path path = ( (FileSplit) inputSplit ).getPath();

      if( path != null )
        {
        // publish the current split's source path so Operations can retrieve it
        // via getCurrentTapSourcePath()
        jobConf.set( FileType.CASCADING_SOURCE_PATH, path.toString() );

        LOG.info( "current split input path: {}", path );
        }
      }
    }
  }
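/*
 * A minimal serialization round-trip sketch, e.g. for a unit test. DataOutputBuffer and
 * DataInputBuffer are Hadoop utility classes in org.apache.hadoop.io; the path and lengths
 * below are placeholder values:
 *
 *   JobConf jobConf = new JobConf();
 *   InputSplit child = new FileSplit( new Path( "/tmp/part-00000" ), 0L, 100L, new String[ 0 ] );
 *   MultiInputSplit split = new MultiInputSplit( child, new HashMap<String, String>() );
 *
 *   DataOutputBuffer out = new DataOutputBuffer();
 *   split.write( out );
 *
 *   MultiInputSplit copy = new MultiInputSplit(); // framework-style construction
 *   copy.configure( jobConf ); // must be called before readFields(), see the no-arg constructor
 *
 *   DataInputBuffer in = new DataInputBuffer();
 *   in.reset( out.getData(), out.getLength() );
 *   copy.readFields( in );
 */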