/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.io;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class HttpFileSystem provides a basic read-only {@link FileSystem} for accessing remote HTTP and HTTPS data.
 * <p>
 * To use this FileSystem, just use regular http:// or https:// URLs.
 */
public class HttpFileSystem extends StreamedFileSystem
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( HttpFileSystem.class );

  /** Field HTTP_SCHEME */
  public static final String HTTP_SCHEME = "http";
  /** Field HTTPS_SCHEME */
  public static final String HTTPS_SCHEME = "https";

  static
    {
    // NOTE: this is a JVM-wide setting on HttpURLConnection, not one scoped to this class
    HttpURLConnection.setFollowRedirects( true );
    }

  /** Field scheme */
  private String scheme;
  /** Field authority */
  private String authority;

  /**
   * Method initialize stores the given configuration and remembers the scheme and authority of the given uri.
   *
   * @param uri           of type URI, the uri this file system is rooted at
   * @param configuration of type Configuration
   * @throws IOException declared by the overridden method
   */
  @Override
  public void initialize( URI uri, Configuration configuration ) throws IOException
    {
    setConf( configuration );

    scheme = uri.getScheme();
    authority = uri.getAuthority();
    }

  /**
   * Method getUri returns a URI built from only the scheme and authority captured in
   * {@link #initialize(URI, Configuration)}.
   *
   * @return URI
   * @throws RuntimeException if the scheme and authority do not form a valid URI
   */
  @Override
  public URI getUri()
    {
    try
      {
      return new URI( scheme, authority, null, null, null );
      }
    catch( URISyntaxException exception )
      {
      throw new RuntimeException( "failed parsing uri", exception );
      }
    }

  /**
   * Method globStatus performs no pattern expansion, the given path is treated as a single remote resource.
   * The given pathFilter is not consulted.
   *
   * @param path       of type Path
   * @param pathFilter of type PathFilter, unused
   * @return a one element array holding the status of path, or {@code null} if the resource cannot be found
   * @throws IOException if the remote server cannot be reached
   */
  @Override
  public FileStatus[] globStatus( Path path, PathFilter pathFilter ) throws IOException
    {
    FileStatus fileStatus;

    try
      {
      fileStatus = getFileStatus( path );
      }
    catch( FileNotFoundException exception )
      {
      // getFileStatus reports a missing resource by throwing, translate that into the null result
      // callers of globStatus test for
      LOG.debug( "no status available for: {}", path, exception );

      return null;
      }

    if( fileStatus == null )
      return null;

    return new FileStatus[]{fileStatus};
    }

  /**
   * Method open issues a GET request for the given path and returns a stream over the response body.
   *
   * @param path of type Path
   * @param i    of type int, the requested buffer size, unused
   * @return FSDataInputStream
   * @throws IOException if the request fails or the response body cannot be read
   */
  @Override
  public FSDataInputStream open( Path path, int i ) throws IOException
    {
    HttpURLConnection connection = connectTo( path, "GET" );
    boolean opened = false;

    try
      {
      debugConnection( connection );

      FSDataInputStream inputStream = new FSDataInputStream( new FSDigestInputStream( connection.getInputStream(), getMD5SumFor( getConf(), path ) ) );

      opened = true;

      return inputStream;
      }
    finally
      {
      // on success the returned stream owns the connection, only release it if we never got that far
      if( !opened )
        connection.disconnect();
      }
    }

  /**
   * Method exists issues a HEAD request and reports whether the server answered with a 200 status.
   *
   * @param path of type Path
   * @return true if the response code is 200
   * @throws IOException if the request fails
   */
  @Override
  public boolean exists( Path path ) throws IOException
    {
    HttpURLConnection connection = connectTo( path, "HEAD" );

    try
      {
      debugConnection( connection );

      return connection.getResponseCode() == HttpURLConnection.HTTP_OK;
      }
    finally
      {
      connection.disconnect();
      }
    }

  /**
   * Method getFileStatus issues a HEAD request and builds a {@link FileStatus} from the Content-Length and
   * Last-Modified response headers.
   * <p>
   * A missing or negative Content-Length is reported as a length of zero. A missing Last-Modified header is
   * reported as the current time.
   *
   * @param path of type Path
   * @return FileStatus describing a non-directory resource
   * @throws FileNotFoundException if the server answers with anything other than a 200 status
   * @throws IOException           if the request fails
   */
  @Override
  public FileStatus getFileStatus( Path path ) throws IOException
    {
    HttpURLConnection connection = connectTo( path, "HEAD" );

    try
      {
      debugConnection( connection );

      if( connection.getResponseCode() != HttpURLConnection.HTTP_OK )
        throw new FileNotFoundException( "could not find file: " + path );

      // read as a long, an int cannot represent resources larger than 2 GiB
      long length = connection.getHeaderFieldLong( "Content-Length", 0 );

      length = length < 0 ? 0 : length; // queries may return -1

      long modified = connection.getHeaderFieldDate( "Last-Modified", System.currentTimeMillis() );

      return new FileStatus( length, false, 1, getDefaultBlockSize(), modified, path );
      }
    finally
      {
      connection.disconnect();
      }
    }

  /**
   * Method connectTo opens a connection to the URL for the given path using the given HTTP request method.
   *
   * @param path   of type Path
   * @param method of type String, the HTTP request method, e.g. GET or HEAD
   * @return HttpURLConnection on which connect() has been called
   * @throws IOException if the URL cannot be built or the connection cannot be established
   */
  private HttpURLConnection connectTo( Path path, String method ) throws IOException
    {
    HttpURLConnection connection = (HttpURLConnection) makeUrl( path ).openConnection();

    connection.setRequestMethod( method );
    connection.connect();

    return connection;
    }

  /**
   * Method debugConnection logs the url, request method and response details of the given connection when
   * debug logging is enabled.
   *
   * @param connection of type HttpURLConnection
   * @throws IOException if the response cannot be read
   */
  private void debugConnection( HttpURLConnection connection ) throws IOException
    {
    if( LOG.isDebugEnabled() )
      {
      LOG.debug( "connection.getURL() = {}", connection.getURL() );
      LOG.debug( "connection.getRequestMethod() = {}", connection.getRequestMethod() );
      LOG.debug( "connection.getResponseCode() = {}", connection.getResponseCode() );
      LOG.debug( "connection.getResponseMessage() = {}", connection.getResponseMessage() );
      LOG.debug( "connection.getContentLength() = {}", connection.getContentLength() );
      }
    }

  /**
   * Method makeUrl converts the given path into a URL.
   * <p>
   * A path that carries its own scheme is parsed as is. Any other path is combined with the scheme and
   * authority captured in {@link #initialize(URI, Configuration)}.
   *
   * @param path of type Path
   * @return URL
   * @throws IOException if the path cannot be parsed into a valid URI
   */
  private URL makeUrl( Path path ) throws IOException
    {
    // the string form is used on purpose so that everything after the authority is passed through as given
    String location = path.toString();

    try
      {
      // test for the presence of a scheme, a bare prefix comparison also matches relative paths
      // that merely begin with the same characters as the scheme
      if( path.toUri().getScheme() != null )
        return new URI( location ).toURL();

      return new URI( scheme, authority, location, null, null ).toURL();
      }
    catch( URISyntaxException exception )
      {
      throw new IOException( exception.getMessage(), exception );
      }
    }
  }