/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
021
022package cascading.tap.local;
023
024import java.beans.ConstructorProperties;
025import java.io.FileInputStream;
026import java.io.FileNotFoundException;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.OutputStream;
030import java.nio.file.Paths;
031import java.util.Properties;
032
033import cascading.flow.FlowProcess;
034import cascading.tap.SinkMode;
035import cascading.tap.Tap;
036import cascading.tap.local.io.TapFileOutputStream;
037import cascading.tap.partition.BasePartitionTap;
038import cascading.tap.partition.Partition;
039import cascading.tuple.TupleEntrySchemeCollector;
040import cascading.tuple.TupleEntrySchemeIterator;
041
042/**
043 * Class PartitionTap can be used to write tuple streams out to files and sub-directories based on the values in the
044 * current {@link cascading.tuple.Tuple} instance.
045 * <p>
046 * The constructor takes a {@link cascading.tap.local.FileTap} {@link cascading.tap.Tap} and a {@link Partition}
047 * implementation. This allows Tuple values at given positions to be used as directory names during write
048 * operations, and directory names as data during read operations.
049 * <p>
050 * The key value here is that there is no need to duplicate data values in the directory names and inside
051 * the data files.
052 * <p>
053 * So only values declared in the parent Tap will be read or written to the underlying file system files. But
054 * fields declared by the {@link Partition} will only be read or written to the directory names. That is, the
055 * PartitionTap instance will sink or source the partition fields, plus the parent Tap fields. The partition
056 * fields and parent Tap fields do not need to have common field names.
057 * <p>
058 * {@code openWritesThreshold} limits the number of open files to be output to. This value defaults to 300 files.
059 * Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
060 * <p>
061 * PartitionTap will populate a given {@code partition} without regard to case of the values being used. Thus
062 * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same
063 * location. Forcing the case to be consistent with a custom Partition implementation or an upstream
064 * {@link cascading.operation.Function} is recommended, see {@link cascading.operation.expression.ExpressionFunction}.
065 */
066public class PartitionTap extends BasePartitionTap<Properties, InputStream, OutputStream>
067  {
068  /**
069   * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
070   * base path and default {@link cascading.scheme.Scheme}, and the partition.
071   *
072   * @param parent    of type Tap
073   * @param partition of type Partition
074   */
075  @ConstructorProperties({"parent", "partition"})
076  public PartitionTap( FileTap parent, Partition partition )
077    {
078    this( parent, partition, OPEN_WRITES_THRESHOLD_DEFAULT );
079    }
080
081  /**
082   * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
083   * base path and default {@link cascading.scheme.Scheme}, and the partition.
084   * <p>
085   * {@code openWritesThreshold} limits the number of open files to be output to.
086   *
087   * @param parent              of type Hfs
088   * @param partition           of type Partition
089   * @param openWritesThreshold of type int
090   */
091  @ConstructorProperties({"parent", "partition", "openWritesThreshold"})
092  public PartitionTap( FileTap parent, Partition partition, int openWritesThreshold )
093    {
094    super( parent, partition, openWritesThreshold );
095    }
096
097  /**
098   * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
099   * base path and default {@link cascading.scheme.Scheme}, and the partition.
100   *
101   * @param parent    of type Tap
102   * @param partition of type Partition
103   * @param sinkMode  of type SinkMode
104   */
105  @ConstructorProperties({"parent", "partition", "sinkMode"})
106  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode )
107    {
108    super( parent, partition, sinkMode );
109    }
110
111  /**
112   * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
113   * base path and default {@link cascading.scheme.Scheme}, and the partition.
114   * <p>
115   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
116   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
117   *
118   * @param parent             of type Tap
119   * @param partition          of type Partition
120   * @param sinkMode           of type SinkMode
121   * @param keepParentOnDelete of type boolean
122   */
123  @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete"})
124  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete )
125    {
126    this( parent, partition, sinkMode, keepParentOnDelete, OPEN_WRITES_THRESHOLD_DEFAULT );
127    }
128
129  /**
130   * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the
131   * base path and default {@link cascading.scheme.Scheme}, and the partition.
132   * <p>
133   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
134   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
135   * <p>
136   * {@code openWritesThreshold} limits the number of open files to be output to.
137   *
138   * @param parent              of type Tap
139   * @param partition           of type Partition
140   * @param sinkMode            of type SinkMode
141   * @param keepParentOnDelete  of type boolean
142   * @param openWritesThreshold of type int
143   */
144  @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete", "openWritesThreshold"})
145  public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete, int openWritesThreshold )
146    {
147    super( parent, partition, sinkMode, keepParentOnDelete, openWritesThreshold );
148    }
149
150  @Override
151  protected String getCurrentIdentifier( FlowProcess<? extends Properties> flowProcess )
152    {
153    return null;
154    }
155
156  @Override
157  public boolean deleteResource( Properties conf ) throws IOException
158    {
159    String[] childIdentifiers = ( (FileTap) parent ).getChildIdentifiers( conf, Integer.MAX_VALUE, false );
160
161    if( childIdentifiers.length == 0 )
162      return deleteParent( conf );
163
164    DirTap.deleteChildren( Paths.get( parent.getIdentifier() ), childIdentifiers );
165
166    return deleteParent( conf );
167    }
168
169  private boolean deleteParent( Properties conf ) throws IOException
170    {
171    return keepParentOnDelete || parent.deleteResource( conf );
172    }
173
174  @Override
175  protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<? extends Properties> flowProcess, Tap parent, String path, long sequence ) throws IOException
176    {
177    TapFileOutputStream output = new TapFileOutputStream( parent, path, true ); // always append
178
179    return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, parent, output );
180    }
181
182  @Override
183  protected TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<? extends Properties> flowProcess, Tap parent, String path, InputStream input ) throws FileNotFoundException
184    {
185    if( input == null )
186      input = new FileInputStream( path );
187
188    return new TupleEntrySchemeIterator( flowProcess, parent, parent.getScheme(), input, path );
189    }
190  }