/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package cascading.tap.local;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Stream;

import cascading.flow.FlowProcess;
import cascading.scheme.FileFormat;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.TapException;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TupleEntrySchemeIterator;
import cascading.util.CloseableIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class DirTap processes all files in the given directory that match the given glob or regex.
 * <p>
 * A DirTap can be used as a sink or a source.
 * <p>
 * When used as a source, the given pattern and depth are used to identify input files from the filesystem.
 * <p>
 * When used as a sink, a single file is created in the directory for all the output data. The file name is
 * returned by {@link #getOutputIdentifier()} and can be overridden (see {@link #getOutputFilename()} and
 * {@link #getOutputFileBasename()}).
 * <p>
 * When deleting the resource identified by this Tap, the value of {@link #getOutputIdentifier()} will
 * be deleted, if it exists.
 * <p>
 * DirTap must be used with the {@link cascading.flow.local.LocalFlowConnector} to create
 * {@link cascading.flow.Flow} instances that run in "local" mode.
 * <p>
 * The given pattern must adhere to the syntax supported by {@link FileSystem#getPathMatcher(String)}.
 * <p>
 * Or a sub-class may override {@link #getPathMatcher()} and return a custom matcher.
 * <p>
 * The maxDepth parameter is the maximum number of levels of directories to visit. A value of 0 means that only the
 * starting file is visited, unless denied by the security manager.
 * <p>
 * A value of MAX_VALUE (the default) may be used to indicate that all levels should be visited.
 */
public class DirTap extends FileTap
  {
  private static final Logger LOG = LoggerFactory.getLogger( DirTap.class );

  /** Depth used when a constructor is not given an explicit {@code maxDepth}: visit all levels. */
  private static final int DEFAULT_MAX_DEPTH = Integer.MAX_VALUE;

  int maxDepth = DEFAULT_MAX_DEPTH;
  String pattern;

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory )
    {
    this( scheme, directory, null, DEFAULT_MAX_DEPTH );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * and {@code pattern}.
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   * @param pattern   of type String
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern )
    {
    this( scheme, directory, pattern, DEFAULT_MAX_DEPTH );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * {@code pattern}, and {@code maxDepth}
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   * @param pattern   of type String
   * @param maxDepth  of type int
   * @throws IllegalArgumentException if {@code maxDepth} is negative or {@code pattern} cannot be parsed
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, int maxDepth )
    {
    super( scheme, directory );
    this.maxDepth = maxDepth;
    this.pattern = pattern;

    verify();
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   * @param sinkMode  of type SinkMode
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, SinkMode sinkMode )
    {
    this( scheme, directory, null, DEFAULT_MAX_DEPTH, sinkMode );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * and {@code pattern}.
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   * @param pattern   of type String
   * @param sinkMode  of type SinkMode
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, SinkMode sinkMode )
    {
    this( scheme, directory, pattern, DEFAULT_MAX_DEPTH, sinkMode );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * {@code pattern}, and {@code maxDepth}
   *
   * @param scheme    of type Scheme
   * @param directory of type String
   * @param pattern   of type String
   * @param maxDepth  of type int
   * @param sinkMode  of type SinkMode
   * @throws IllegalArgumentException if {@code maxDepth} is negative or {@code pattern} cannot be parsed
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, int maxDepth, SinkMode sinkMode )
    {
    super( scheme, directory, sinkMode );
    this.maxDepth = maxDepth;
    this.pattern = pattern;

    verify();
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory )
    {
    this( scheme, directory, null, DEFAULT_MAX_DEPTH );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * and {@code pattern}.
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   * @param pattern   of type String
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern )
    {
    this( scheme, directory, pattern, DEFAULT_MAX_DEPTH );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * {@code pattern}, and {@code maxDepth}
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   * @param pattern   of type String
   * @param maxDepth  of type int
   * @throws IllegalArgumentException if {@code maxDepth} is negative or {@code pattern} cannot be parsed
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, int maxDepth )
    {
    super( scheme, directory );
    this.maxDepth = maxDepth;
    this.pattern = pattern;

    verify();
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   * @param sinkMode  of type SinkMode
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, SinkMode sinkMode )
    {
    this( scheme, directory, null, DEFAULT_MAX_DEPTH, sinkMode );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * and {@code pattern}.
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   * @param pattern   of type String
   * @param sinkMode  of type SinkMode
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, SinkMode sinkMode )
    {
    this( scheme, directory, pattern, DEFAULT_MAX_DEPTH, sinkMode );
    }

  /**
   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
   * {@code pattern}, and {@code maxDepth}
   *
   * @param scheme    of type Scheme
   * @param directory of type Path
   * @param pattern   of type String
   * @param maxDepth  of type int
   * @param sinkMode  of type SinkMode
   * @throws IllegalArgumentException if {@code maxDepth} is negative or {@code pattern} cannot be parsed
   */
  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, int maxDepth, SinkMode sinkMode )
    {
    super( scheme, directory, sinkMode );
    this.maxDepth = maxDepth;
    this.pattern = pattern;

    verify();
    }

  /**
   * Method verify validates the state of this instance after the parent class checks have run.
   * <p>
   * A {@code maxDepth} of 0 is accepted, only negative values are rejected. The pattern is validated by
   * requesting a matcher from {@link #getPathMatcher()}.
   *
   * @throws IllegalArgumentException if {@code maxDepth} is negative, or if creating the path matcher fails
   */
  @Override
  protected void verify()
    {
    super.verify();

    if( maxDepth < 0 )
      throw new IllegalArgumentException( "maxDepth must be greater than or equal to 0, given: " + maxDepth );

    try
      {
      getPathMatcher();
      }
    catch( RuntimeException exception )
      {
      throw new IllegalArgumentException( "could not parse pattern: " + getPattern(), exception );
      }
    }

  /**
   * Method getOutputIdentifier returns the path of this tap resolved against {@link #getOutputFilename()}.
   *
   * @return the output file path as a String
   */
  @Override
  public String getOutputIdentifier()
    {
    return getPath().resolve( getOutputFilename() ).toString();
    }

  /**
   * Method getOutputFilename returns {@link #getOutputFileBasename()} followed by a file extension.
   * <p>
   * If the current scheme is a {@link FileFormat}, its extension is used, otherwise the extension is {@code tap}.
   *
   * @return the output file name
   */
  public String getOutputFilename()
    {
    String basename = getOutputFileBasename();

    if( getScheme() instanceof FileFormat )
      return basename + "." + ( (FileFormat) getScheme() ).getExtension();

    return basename + ".tap";
    }

  /**
   * Method getOutputFileBasename returns the output file name without an extension. Sub-classes may override
   * this method to change the name.
   *
   * @return the value {@code output}
   */
  protected String getOutputFileBasename()
    {
    return "output";
    }

  /**
   * Method getPattern returns the pattern given on construction.
   *
   * @return the pattern, or {@code null} if none was given
   */
  public String getPattern()
    {
    return pattern;
    }

  /**
   * Method getMaxDepth returns the maximum number of directory levels to visit.
   *
   * @return the max depth
   */
  public int getMaxDepth()
    {
    return maxDepth;
    }

  /**
   * Method deleteResource delegates to {@link #deleteDirTap(DirTap, Properties)}.
   */
  @Override
  public boolean deleteResource( Properties conf ) throws IOException
    {
    return deleteDirTap( this, conf );
    }

  /**
   * Method openForRead opens every matching file under the path of this tap, one after another.
   * <p>
   * If the path is not a directory and no pattern was given, the call is handed to the parent class. Otherwise
   * the directory is walked up to {@code maxDepth} levels, directories are skipped, and each file accepted by
   * {@link #getPathMatcher()} is opened in turn. The given {@code input} is not used when walking a directory.
   *
   * @param flowProcess of type FlowProcess
   * @param input       of type InputStream
   * @return a TupleEntryIterator over all matching files
   * @throws IOException           if the directory cannot be walked
   * @throws IllegalStateException if a pattern was given and the path is not a directory
   */
  @Override
  public TupleEntryIterator openForRead( FlowProcess<? extends Properties> flowProcess, InputStream input ) throws IOException
    {
    if( !Files.isDirectory( getPath() ) )
      {
      if( getPattern() != null )
        throw new IllegalStateException( "a file pattern was provided and given path is not a directory: " + getPath() );

      return super.openForRead( flowProcess, input );
      }

    PathMatcher pathMatcher = getPathMatcher();

    CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>()
      {
      Stream<Path> stream = Files.walk( getPath(), maxDepth )
        .filter( path -> !Files.isDirectory( path ) )
        .filter( pathMatcher::matches );
      Iterator<Path> paths = stream.iterator();
      InputStream lastInputStream = null;

      @Override
      public boolean hasNext()
        {
        return paths.hasNext();
        }

      @Override
      public InputStream next()
        {
        // only one file is kept open at a time
        safeClose();

        Path path = paths.next();

        flowProcess.getFlowProcessContext().setSourcePath( path.toAbsolutePath().toString() );

        LOG.debug( "opening: {}", path );

        try
          {
          lastInputStream = Files.newInputStream( path );

          return lastInputStream;
          }
        catch( IOException exception )
          {
          throw new TapException( "unable to open path: " + path, exception );
          }
        }

      private void safeClose()
        {
        if( lastInputStream == null )
          return;

        try
          {
          lastInputStream.close();
          }
        catch( IOException exception )
          {
          // best effort, a failed close must not stop the read, but should be visible
          LOG.warn( "unable to close input stream, ignoring", exception );
          }
        finally
          {
          // never retry the close of a stream that already failed
          lastInputStream = null;
          }
        }

      @Override
      public void close() throws IOException
        {
        safeClose();

        // releases the directory handles held by Files.walk
        stream.close();
        }
      };

    return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> flowProcess.getFlowProcessContext().getSourcePath() );
    }

  /**
   * Method getChildIdentifiers returns the files found under the path of this tap that are accepted by
   * {@link #getPathMatcher()}. Directories are never returned.
   *
   * @param conf           of type Properties
   * @param depth          the maximum number of directory levels to visit
   * @param fullyQualified if true, absolute paths are returned
   * @return the matching paths in the order they were visited, empty if the resource does not exist
   * @throws IOException           if the directory cannot be walked
   * @throws IllegalStateException if the path exists and is not a directory
   */
  @Override
  public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException
    {
    if( !resourceExists( conf ) )
      return new String[ 0 ];

    if( !Files.isDirectory( getPath() ) )
      throw new IllegalStateException( "given path is not a directory: " + getPath() );

    Set<String> results = new LinkedHashSet<String>();

    PathMatcher pathMatcher = getPathMatcher();

    try( final Stream<Path> pathStream = Files.walk( getPath(), depth ) )
      {
      pathStream
        .filter( path -> !Files.isDirectory( path ) )
        .filter( pathMatcher::matches )
        .forEach( path -> results.add( fullyQualified ? path.toAbsolutePath().toString() : path.toString() ) );
      }

    return results.toArray( new String[ results.size() ] );
    }

  /**
   * Method getPathMatcher returns the matcher used to select files.
   * <p>
   * If no pattern was given, every path is accepted. Otherwise the matcher is created by the
   * {@link FileSystem} of the path of this tap, see {@link FileSystem#getPathMatcher(String)}.
   *
   * @return a PathMatcher, never null
   */
  protected PathMatcher getPathMatcher()
    {
    if( getPattern() == null )
      return path -> true;

    FileSystem fileSystem = getPath().getFileSystem();

    return fileSystem.getPathMatcher( getPattern() );
    }

  /**
   * Method deleteDirTap will recursively delete all files referenced by the given DirTap.
   * <p>
   * The top level entries of the child identifiers are deleted first, then the directory itself if it exists.
   *
   * @param dirTap the directory to delete
   * @param conf   the configuration passed to {@link #getChildIdentifiers(Object)}
   * @return always true
   * @throws IOException if a file or directory cannot be deleted
   */
  public static boolean deleteDirTap( DirTap dirTap, Properties conf ) throws IOException
    {
    deleteChildren( dirTap.getPath(), dirTap.getChildIdentifiers( conf ) );

    Files.deleteIfExists( dirTap.getPath() );

    return true;
    }

  /**
   * Deletes the child files and their directories. Does not delete the parent path.
   * <p>
   * Every child identifier is reduced to the first path element below the parent path, and that
   * element is deleted recursively.
   *
   * @param parentPath       the path the child identifiers are relative to
   * @param childIdentifiers the paths of the children to delete
   * @throws IOException if a file or directory cannot be deleted
   */
  protected static void deleteChildren( Path parentPath, String[] childIdentifiers ) throws IOException
    {
    // a set, many children may share the same top level entry
    Set<Path> parents = new HashSet<>();

    for( String childIdentifier : childIdentifiers )
      {
      Path path = Paths.get( childIdentifier );

      parents.add( parentPath.resolve( parentPath.relativize( path ).subpath( 0, 1 ) ) );
      }

    for( Path subParent : parents )
      recursiveDelete( subParent );
    }

  /**
   * Deletes the given path. If it is a directory, its content is deleted first.
   *
   * @param path the path to delete, null is ignored
   * @throws IOException if a file or directory cannot be deleted
   */
  private static void recursiveDelete( Path path ) throws IOException
    {
    if( path == null )
      return;

    if( Files.isDirectory( path ) )
      {
      try( DirectoryStream<Path> paths = Files.newDirectoryStream( path ) )
        {
        for( Path current : paths )
          recursiveDelete( current );
        }
      }

    Files.deleteIfExists( path );
    }
  }