001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.tap.hadoop.io;
023
024import java.io.FileNotFoundException;
025import java.io.IOException;
026import java.net.HttpURLConnection;
027import java.net.URI;
028import java.net.URISyntaxException;
029import java.net.URL;
030
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FSDataInputStream;
033import org.apache.hadoop.fs.FileStatus;
034import org.apache.hadoop.fs.FileSystem;
035import org.apache.hadoop.fs.Path;
036import org.apache.hadoop.fs.PathFilter;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040/**
041 * Class HttpFileSystem provides a basic read-only {@link FileSystem} for accessing remote HTTP and HTTPS data.
042 * <p>
043 * To use this FileSystem, just use regular http:// or https:// URLs.
044 */
045public class HttpFileSystem extends StreamedFileSystem
046  {
047  /** Field LOG */
048  private static final Logger LOG = LoggerFactory.getLogger( HttpFileSystem.class );
049
050  /** Field HTTP_SCHEME */
051  public static final String HTTP_SCHEME = "http";
052  /** Field HTTPS_SCHEME */
053  public static final String HTTPS_SCHEME = "https";
054
055  static
056    {
057    HttpURLConnection.setFollowRedirects( true );
058    }
059
060  /** Field scheme */
061  private String scheme;
062  /** Field authority */
063  private String authority;
064
065  @Override
066  public void initialize( URI uri, Configuration configuration ) throws IOException
067    {
068    setConf( configuration );
069
070    scheme = uri.getScheme();
071    authority = uri.getAuthority();
072    }
073
074  @Override
075  public URI getUri()
076    {
077    try
078      {
079      return new URI( scheme, authority, null, null, null );
080      }
081    catch( URISyntaxException exception )
082      {
083      throw new RuntimeException( "failed parsing uri", exception );
084      }
085    }
086
087  @Override
088  public FileStatus[] globStatus( Path path, PathFilter pathFilter ) throws IOException
089    {
090    FileStatus fileStatus = getFileStatus( path );
091
092    if( fileStatus == null )
093      return null;
094
095    return new FileStatus[]{fileStatus};
096    }
097
098  @Override
099  public FSDataInputStream open( Path path, int i ) throws IOException
100    {
101    URL url = makeUrl( path );
102
103    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
104    connection.setRequestMethod( "GET" );
105    connection.connect();
106
107    debugConnection( connection );
108
109    return new FSDataInputStream( new FSDigestInputStream( connection.getInputStream(), getMD5SumFor( getConf(), path ) ) );
110    }
111
112  @Override
113  public boolean exists( Path path ) throws IOException
114    {
115    URL url = makeUrl( path );
116
117    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
118    connection.setRequestMethod( "HEAD" );
119    connection.connect();
120
121    debugConnection( connection );
122
123    return connection.getResponseCode() == 200;
124    }
125
126  @Override
127  public FileStatus getFileStatus( Path path ) throws IOException
128    {
129    URL url = makeUrl( path );
130
131    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
132    connection.setRequestMethod( "HEAD" );
133    connection.connect();
134
135    debugConnection( connection );
136
137    if( connection.getResponseCode() != 200 )
138      throw new FileNotFoundException( "could not find file: " + path );
139
140    long length = connection.getHeaderFieldInt( "Content-Length", 0 );
141
142    length = length < 0 ? 0 : length; // queries may return -1
143
144    long modified = connection.getHeaderFieldDate( "Last-Modified", System.currentTimeMillis() );
145
146    return new FileStatus( length, false, 1, getDefaultBlockSize(), modified, path );
147    }
148
149  private void debugConnection( HttpURLConnection connection ) throws IOException
150    {
151    if( LOG.isDebugEnabled() )
152      {
153      LOG.debug( "connection.getURL() = {}", connection.getURL() );
154      LOG.debug( "connection.getRequestMethod() = {}", connection.getRequestMethod() );
155      LOG.debug( "connection.getResponseCode() = {}", connection.getResponseCode() );
156      LOG.debug( "connection.getResponseMessage() = {}", connection.getResponseMessage() );
157      LOG.debug( "connection.getContentLength() = {}", connection.getContentLength() );
158      }
159    }
160
161  private URL makeUrl( Path path ) throws IOException
162    {
163    if( path.toString().startsWith( scheme ) )
164      return URI.create( path.toString() ).toURL();
165
166    try
167      {
168      return new URI( scheme, authority, path.toString(), null, null ).toURL();
169      }
170    catch( URISyntaxException exception )
171      {
172      throw new IOException( exception.getMessage() );
173      }
174    }
175  }