001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 *
020 */
021
022package cascading.tap.local;
023
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Stream;

import cascading.flow.FlowProcess;
import cascading.scheme.FileFormat;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.TapException;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TupleEntrySchemeIterator;
import cascading.util.CloseableIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
050
051/**
052 * Class DirTap processes all files in the given directory that match the given glob or regex.
053 * <p>
054 * A DirTap can be used as a sink or a source.
055 * <p>
056 * When used as a source, the given pattern and depth are used to identify input files from the filesystem.
057 * <p>
058 * When used as a sink, a single file is created in the directory for all the output data. The file is name is
059 * returned by {@link #getOutputIdentifier()} and can be overridden (see {@link #getOutputFilename()} and
060 * {@link #getOutputFileBasename()}).
061 * <p>
062 * When deleting the resource identified by this Tap, the value of {@link #getOutputIdentifier()} will
063 * be deleted, if it exists.
064 * <p>
065 * DirTap must be used with the {@link cascading.flow.local.LocalFlowConnector} to create
066 * {@link cascading.flow.Flow} instances that run in "local" mode.
067 * <p>
068 * The given pattern must adhere to the syntax supported by {@link FileSystem#getPathMatcher(String)}.
069 * <p>
070 * Or a sub-class may override {@link #getPathMatcher()} and return a custom matcher.
071 * <p>
072 * The maxDepth parameter is the maximum number of levels of directories to visit. A value of 0 means that only the
073 * starting file is visited, unless denied by the security manager.
074 * <p>
075 * A value of MAX_VALUE (the default) may be used to indicate that all levels should be visited.
076 */
077public class DirTap extends FileTap
078  {
079  private static final Logger LOG = LoggerFactory.getLogger( DirTap.class );
080
081  int maxDepth = Integer.MAX_VALUE;
082  String pattern;
083
084  /**
085   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
086   *
087   * @param scheme    of type Scheme
088   * @param directory of type String
089   */
090  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory )
091    {
092    super( scheme, directory );
093
094    verify();
095    }
096
097  /**
098   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
099   * and {@code pattern}.
100   *
101   * @param scheme    of type Scheme
102   * @param directory of type String
103   * @param pattern   of type String
104   */
105  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern )
106    {
107    super( scheme, directory );
108    this.pattern = pattern;
109
110    verify();
111    }
112
113  /**
114   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
115   * {@code pattern}, and {@code maxDepth}
116   *
117   * @param scheme    of type Scheme
118   * @param directory of type String
119   * @param pattern   of type String
120   * @param maxDepth  of type int
121   */
122  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, int maxDepth )
123    {
124    super( scheme, directory );
125    this.maxDepth = maxDepth;
126    this.pattern = pattern;
127
128    verify();
129    }
130
131  /**
132   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
133   *
134   * @param scheme    of type Scheme
135   * @param directory of type String
136   * @param sinkMode  of type SinkMode
137   */
138  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, SinkMode sinkMode )
139    {
140    super( scheme, directory, sinkMode );
141
142    verify();
143    }
144
145  /**
146   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
147   * and {@code pattern}.
148   *
149   * @param scheme    of type Scheme
150   * @param directory of type String
151   * @param pattern   of type String
152   * @param sinkMode  of type SinkMode
153   */
154  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, SinkMode sinkMode )
155    {
156    super( scheme, directory, sinkMode );
157    this.pattern = pattern;
158
159    verify();
160    }
161
162  /**
163   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
164   * {@code pattern}, and {@code maxDepth}
165   *
166   * @param scheme    of type Scheme
167   * @param directory of type String
168   * @param pattern   of type String
169   * @param maxDepth  of type int
170   * @param sinkMode  of type SinkMode
171   */
172  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String directory, String pattern, int maxDepth, SinkMode sinkMode )
173    {
174    super( scheme, directory, sinkMode );
175    this.maxDepth = maxDepth;
176    this.pattern = pattern;
177
178    verify();
179    }
180
181  /**
182   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
183   *
184   * @param scheme    of type Scheme
185   * @param directory of type Path
186   */
187  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory )
188    {
189    super( scheme, directory );
190
191    verify();
192    }
193
194  /**
195   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
196   * and {@code pattern}.
197   *
198   * @param scheme    of type Scheme
199   * @param directory of type Path
200   * @param pattern   of type String
201   */
202  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern )
203    {
204    super( scheme, directory );
205    this.pattern = pattern;
206
207    verify();
208    }
209
210  /**
211   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
212   * {@code pattern}, and {@code maxDepth}
213   *
214   * @param scheme    of type Scheme
215   * @param directory of type Path
216   * @param pattern   of type String
217   * @param maxDepth  of type int
218   */
219  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, int maxDepth )
220    {
221    super( scheme, directory );
222    this.maxDepth = maxDepth;
223    this.pattern = pattern;
224
225    verify();
226    }
227
228  /**
229   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme} and file {@code directory}.
230   *
231   * @param scheme    of type Scheme
232   * @param directory of type Path
233   * @param sinkMode  of type SinkMode
234   */
235  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, SinkMode sinkMode )
236    {
237    super( scheme, directory, sinkMode );
238
239    verify();
240    }
241
242  /**
243   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
244   * and {@code pattern}.
245   *
246   * @param scheme    of type Scheme
247   * @param directory of type Path
248   * @param pattern   of type String
249   * @param sinkMode  of type SinkMode
250   */
251  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, SinkMode sinkMode )
252    {
253    super( scheme, directory, sinkMode );
254    this.pattern = pattern;
255
256    verify();
257    }
258
259  /**
260   * Constructor DirTap creates a new DirTap instance using the given {@link cascading.scheme.Scheme}, file {@code directory},
261   * {@code pattern}, and {@code maxDepth}
262   *
263   * @param scheme    of type Scheme
264   * @param directory of type Path
265   * @param pattern   of type String
266   * @param maxDepth  of type int
267   * @param sinkMode  of type SinkMode
268   */
269  public DirTap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, Path directory, String pattern, int maxDepth, SinkMode sinkMode )
270    {
271    super( scheme, directory, sinkMode );
272    this.maxDepth = maxDepth;
273    this.pattern = pattern;
274
275    verify();
276    }
277
278  protected void verify()
279    {
280    super.verify();
281
282    if( maxDepth < 0 )
283      throw new IllegalArgumentException( "maxDepth must be greater than 0, given: " + maxDepth );
284
285    try
286      {
287      getPathMatcher();
288      }
289    catch( RuntimeException exception )
290      {
291      throw new IllegalArgumentException( "could not parse pattern: " + getPattern(), exception );
292      }
293    }
294
295  @Override
296  public String getOutputIdentifier()
297    {
298    return getPath().resolve( getOutputFilename() ).toString();
299    }
300
301  public String getOutputFilename()
302    {
303    if( getScheme() instanceof FileFormat )
304      return getOutputFileBasename() + "." + ( (FileFormat) getScheme() ).getExtension();
305
306    return getOutputFileBasename() + ".tap";
307    }
308
309  protected String getOutputFileBasename()
310    {
311    return "output";
312    }
313
314  public String getPattern()
315    {
316    return pattern;
317    }
318
319  public int getMaxDepth()
320    {
321    return maxDepth;
322    }
323
324  @Override
325  public boolean deleteResource( Properties conf ) throws IOException
326    {
327    return deleteDirTap( this, conf );
328    }
329
330  @Override
331  public TupleEntryIterator openForRead( FlowProcess<? extends Properties> flowProcess, InputStream input ) throws IOException
332    {
333    if( !Files.isDirectory( getPath() ) && getPattern() != null )
334      throw new IllegalStateException( "a file pattern was provided and given path is not a directory: " + getPath() );
335
336    if( !Files.isDirectory( getPath() ) )
337      return super.openForRead( flowProcess, input );
338
339    PathMatcher pathMatcher = getPathMatcher();
340
341    CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>()
342      {
343      Stream<Path> stream = Files.walk( getPath(), maxDepth )
344        .filter( path -> !Files.isDirectory( path ) )
345        .filter( pathMatcher::matches );
346      Iterator<Path> iterator = stream.iterator();
347      InputStream lastInputStream = null;
348
349      @Override
350      public boolean hasNext()
351        {
352        return iterator.hasNext();
353        }
354
355      @Override
356      public InputStream next()
357        {
358        safeClose();
359
360        Path path = iterator.next();
361
362        flowProcess.getFlowProcessContext().setSourcePath( path.toAbsolutePath().toString() );
363
364        if( LOG.isDebugEnabled() )
365          LOG.debug( "opening: {}", path );
366
367        try
368          {
369          lastInputStream = Files.newInputStream( path );
370
371          return lastInputStream;
372          }
373        catch( IOException exception )
374          {
375          throw new TapException( "unable to open path: " + path, exception );
376          }
377        }
378
379      private void safeClose()
380        {
381        try
382          {
383          if( lastInputStream != null )
384            lastInputStream.close();
385
386          lastInputStream = null;
387          }
388        catch( IOException exception )
389          {
390          // do nothing
391          }
392        }
393
394      @Override
395      public void close() throws IOException
396        {
397        safeClose();
398
399        if( stream != null )
400          stream.close();
401        }
402      };
403
404    return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> flowProcess.getFlowProcessContext().getSourcePath() );
405    }
406
407  @Override
408  public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException
409    {
410    if( !resourceExists( conf ) )
411      return new String[ 0 ];
412
413    if( !Files.isDirectory( getPath() ) )
414      throw new IllegalStateException( "given path is not a directory: " + getPath() );
415
416    Set<String> results = new LinkedHashSet<String>();
417
418    PathMatcher pathMatcher = getPathMatcher();
419
420    try( final Stream<Path> pathStream = Files.walk( getPath(), depth ) )
421      {
422      pathStream
423        .filter( path -> !Files.isDirectory( path ) )
424        .filter( pathMatcher::matches )
425        .forEach( path -> results.add( fullyQualified ? path.toAbsolutePath().toString() : path.toString() ) );
426      }
427
428    return results.toArray( new String[ results.size() ] );
429    }
430
431  protected PathMatcher getPathMatcher()
432    {
433    if( getPattern() == null )
434      return path -> true;
435
436    FileSystem fileSystem = getPath().getFileSystem();
437
438    return fileSystem.getPathMatcher( getPattern() );
439    }
440
441  /**
442   * Method deleteDirTap will recursively delete all files referenced by the given DirTap.
443   *
444   * @param dirTap the directory to delete
445   */
446  public static boolean deleteDirTap( DirTap dirTap, Properties conf ) throws IOException
447    {
448    deleteChildren( dirTap.getPath(), dirTap.getChildIdentifiers( conf ) );
449
450    Files.deleteIfExists( dirTap.getPath() );
451
452    return true;
453    }
454
455  /**
456   * Deletes the child files and their directories. Does not delete the parent path.
457   *
458   * @param parentPath
459   * @param childIdentifiers
460   * @throws IOException
461   */
462  protected static void deleteChildren( Path parentPath, String[] childIdentifiers ) throws IOException
463    {
464    Set<Path> parents = new HashSet<>();
465
466    for( String childIdentifier : childIdentifiers )
467      {
468      Path path = Paths.get( childIdentifier );
469
470      parents.add( parentPath.resolve( parentPath.relativize( path ).subpath( 0, 1 ) ) );
471      }
472
473    for( Path subParent : parents )
474      recursiveDelete( subParent );
475    }
476
477  private static void recursiveDelete( Path path ) throws IOException
478    {
479    if( path == null )
480      return;
481
482    if( Files.isDirectory( path ) )
483      {
484      try( DirectoryStream<Path> paths = Files.newDirectoryStream( path ) )
485        {
486        for( Path current : paths )
487          recursiveDelete( current );
488        }
489      }
490
491    Files.deleteIfExists( path );
492    }
493  }