001/* 002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.tap.local; 023 024import java.beans.ConstructorProperties; 025import java.io.FileInputStream; 026import java.io.FileNotFoundException; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.OutputStream; 030import java.nio.file.Paths; 031import java.util.Properties; 032 033import cascading.flow.FlowProcess; 034import cascading.tap.SinkMode; 035import cascading.tap.Tap; 036import cascading.tap.local.io.TapFileOutputStream; 037import cascading.tap.partition.BasePartitionTap; 038import cascading.tap.partition.Partition; 039import cascading.tuple.TupleEntrySchemeCollector; 040import cascading.tuple.TupleEntrySchemeIterator; 041 042/** 043 * Class PartitionTap can be used to write tuple streams out to files and sub-directories based on the values in the 044 * current {@link cascading.tuple.Tuple} instance. 045 * <p> 046 * The constructor takes a {@link cascading.tap.local.FileTap} {@link cascading.tap.Tap} and a {@link Partition} 047 * implementation. This allows Tuple values at given positions to be used as directory names during write 048 * operations, and directory names as data during read operations. 049 * <p> 050 * The key value here is that there is no need to duplicate data values in the directory names and inside 051 * the data files. 052 * <p> 053 * So only values declared in the parent Tap will be read or written to the underlying file system files. But 054 * fields declared by the {@link Partition} will only be read or written to the directory names. That is, the 055 * PartitionTap instance will sink or source the partition fields, plus the parent Tap fields. The partition 056 * fields and parent Tap fields do not need to have common field names. 057 * <p> 058 * {@code openWritesThreshold} limits the number of open files to be output to. This value defaults to 300 files. 059 * Each time the threshold is exceeded, 10% of the least recently used open files will be closed. 060 * <p> 061 * PartitionTap will populate a given {@code partition} without regard to case of the values being used. Thus 062 * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same 063 * location. Forcing the case to be consistent with a custom Partition implementation or an upstream 064 * {@link cascading.operation.Function} is recommended, see {@link cascading.operation.expression.ExpressionFunction}. 065 */ 066public class PartitionTap extends BasePartitionTap<Properties, InputStream, OutputStream> 067 { 068 /** 069 * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the 070 * base path and default {@link cascading.scheme.Scheme}, and the partition. 071 * 072 * @param parent of type Tap 073 * @param partition of type Partition 074 */ 075 @ConstructorProperties({"parent", "partition"}) 076 public PartitionTap( FileTap parent, Partition partition ) 077 { 078 this( parent, partition, OPEN_WRITES_THRESHOLD_DEFAULT ); 079 } 080 081 /** 082 * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the 083 * base path and default {@link cascading.scheme.Scheme}, and the partition. 084 * <p> 085 * {@code openWritesThreshold} limits the number of open files to be output to. 086 * 087 * @param parent of type Hfs 088 * @param partition of type Partition 089 * @param openWritesThreshold of type int 090 */ 091 @ConstructorProperties({"parent", "partition", "openWritesThreshold"}) 092 public PartitionTap( FileTap parent, Partition partition, int openWritesThreshold ) 093 { 094 super( parent, partition, openWritesThreshold ); 095 } 096 097 /** 098 * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the 099 * base path and default {@link cascading.scheme.Scheme}, and the partition. 100 * 101 * @param parent of type Tap 102 * @param partition of type Partition 103 * @param sinkMode of type SinkMode 104 */ 105 @ConstructorProperties({"parent", "partition", "sinkMode"}) 106 public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode ) 107 { 108 super( parent, partition, sinkMode ); 109 } 110 111 /** 112 * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the 113 * base path and default {@link cascading.scheme.Scheme}, and the partition. 114 * <p> 115 * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)} 116 * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}. 117 * 118 * @param parent of type Tap 119 * @param partition of type Partition 120 * @param sinkMode of type SinkMode 121 * @param keepParentOnDelete of type boolean 122 */ 123 @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete"}) 124 public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete ) 125 { 126 this( parent, partition, sinkMode, keepParentOnDelete, OPEN_WRITES_THRESHOLD_DEFAULT ); 127 } 128 129 /** 130 * Constructor PartitionTap creates a new PartitionTap instance using the given parent {@link cascading.tap.local.FileTap} Tap as the 131 * base path and default {@link cascading.scheme.Scheme}, and the partition. 132 * <p> 133 * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)} 134 * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}. 135 * <p> 136 * {@code openWritesThreshold} limits the number of open files to be output to. 137 * 138 * @param parent of type Tap 139 * @param partition of type Partition 140 * @param sinkMode of type SinkMode 141 * @param keepParentOnDelete of type boolean 142 * @param openWritesThreshold of type int 143 */ 144 @ConstructorProperties({"parent", "partition", "sinkMode", "keepParentOnDelete", "openWritesThreshold"}) 145 public PartitionTap( FileTap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete, int openWritesThreshold ) 146 { 147 super( parent, partition, sinkMode, keepParentOnDelete, openWritesThreshold ); 148 } 149 150 @Override 151 protected String getCurrentIdentifier( FlowProcess<? extends Properties> flowProcess ) 152 { 153 return null; 154 } 155 156 @Override 157 public boolean deleteResource( Properties conf ) throws IOException 158 { 159 String[] childIdentifiers = ( (FileTap) parent ).getChildIdentifiers( conf, Integer.MAX_VALUE, false ); 160 161 if( childIdentifiers.length == 0 ) 162 return deleteParent( conf ); 163 164 DirTap.deleteChildren( Paths.get( parent.getIdentifier() ), childIdentifiers ); 165 166 return deleteParent( conf ); 167 } 168 169 private boolean deleteParent( Properties conf ) throws IOException 170 { 171 return keepParentOnDelete || parent.deleteResource( conf ); 172 } 173 174 @Override 175 protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<? extends Properties> flowProcess, Tap parent, String path, long sequence ) throws IOException 176 { 177 TapFileOutputStream output = new TapFileOutputStream( parent, path, true ); // always append 178 179 return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, parent, output ); 180 } 181 182 @Override 183 protected TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<? extends Properties> flowProcess, Tap parent, String path, InputStream input ) throws FileNotFoundException 184 { 185 if( input == null ) 186 input = new FileInputStream( path ); 187 188 return new TupleEntrySchemeIterator( flowProcess, parent, parent.getScheme(), input, path ); 189 } 190 }