/*
 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tuple;

import java.io.Closeable;
import java.io.IOException;
import java.util.function.Supplier;

/**
 * Class TupleEntryCollector is used to allow {@link cascading.operation.BaseOperation} instances to emit
 * one or more result {@link Tuple} values.
 * <p>
 * The general rule in Cascading is that a Tuple handed to you must not be changed or cached. Attempts at modifying
 * such a Tuple will result in an Exception. Preventing caching is harder, see below.
 * <p>
 * If you create the Tuple, you can re-use or modify it.
 * <p>
 * When calling {@link #add(Tuple)} or {@link #add(TupleEntry)}, you are passing a Tuple to the downstream pipes and
 * operations. Since no downstream operation may modify or cache the Tuple instance, it is safe to re-use the Tuple
 * instance once {@code add()} returns.
 * <p>
 * That said, Tuple copies do get cached in order to perform specific operations in the underlying platforms. Currently
 * only a shallow copy is made (via the {@link Tuple} copy constructor). Thus, any mutable type or collection
 * placed inside a Tuple will not be copied, but will likely be cached whenever the Tuple passed downstream is
 * copied.
 * <p>
 * So any subsequent changes to that nested type or collection will be reflected in the cached copy, a likely
 * source of hard to find errors.
 * <p>
 * There is currently no way to specify that a deep copy must be performed when making a Tuple copy.
 */
public abstract class TupleEntryCollector implements Closeable
{
  /**
   * Entry handed to {@link #collect(TupleEntry)} by {@link #add(Tuple)}, and by {@link #add(TupleEntry)} when the
   * incoming fields differ from the declared ones. It starts out with {@link Fields#UNKNOWN} and no tuple, and is
   * replaced by {@link #setFields(Fields)}.
   */
  protected TupleEntry tupleEntry = new TupleEntry( Fields.UNKNOWN, null, true );

  /**
   * Constructor TupleEntryCollector creates a new TupleEntryCollector instance that keeps the initial
   * {@link Fields#UNKNOWN} entry.
   */
  protected TupleEntryCollector()
  {
  }

  /**
   * Constructor TupleEntryCollector creates a new TupleEntryCollector instance and applies the given fields
   * through {@link #setFields(Fields)}.
   *
   * @param declared of type Fields
   */
  public TupleEntryCollector( Fields declared )
  {
    setFields( declared );
  }

  /**
   * Method setFields sets the fields this collector expects emitted values to carry.
   * <p>
   * When the given fields are unknown or ALL, the current internal entry is left untouched. Otherwise the
   * internal entry is replaced with one built from the declaration form of the given fields.
   *
   * @param declared of type Fields
   * @throws IllegalArgumentException if {@code declared} is null
   */
  public void setFields( Fields declared )
  {
    if( declared == null )
    {
      throw new IllegalArgumentException( "declared fields must not be null" );
    }

    boolean keepCurrentEntry = declared.isUnknown() || declared.isAll();

    if( !keepCurrentEntry )
    {
      // if operation declared ARGS, then the arguments are a selector and must be forced to declared
      Fields declaration = Fields.asDeclaration( declared );

      this.tupleEntry = new TupleEntry( declaration, Tuple.size( declaration.size() ), true );
    }
  }

  /**
   * Method addTupleEntry obtains a {@link TupleEntry} from the given supplier and inserts it into the outgoing
   * stream via {@link #add(TupleEntry)}.
   * <p>
   * See {@link cascading.tuple.TupleEntryCollector} on when and how to re-use a Tuple instance.
   *
   * @param supplier of type Supplier
   */
  public void addTupleEntry( Supplier<TupleEntry> supplier )
  {
    TupleEntry supplied = supplier.get();

    add( supplied );
  }

  /**
   * Method add inserts the given {@link TupleEntry} into the outgoing stream.
   * <p>
   * When this collector's fields are unknown, or equal to the fields of the given entry, the given entry is
   * collected as is. Otherwise the expected fields are selected out of the given entry and collected through
   * the internal entry. Note {@link #add(Tuple)} skips this field comparison and selection.
   * <p>
   * See {@link cascading.tuple.TupleEntryCollector} on when and how to re-use a Tuple instance.
   *
   * @param tupleEntry of type TupleEntry
   * @throws TupleException if the expected fields cannot be selected from the given entry, or collecting fails
   */
  public void add( TupleEntry tupleEntry )
  {
    // capture the internal entry once, the parameter shadows the field of the same name
    TupleEntry declaredEntry = this.tupleEntry;
    Fields expectedFields = declaredEntry.getFields();

    boolean passThrough = expectedFields.isUnknown() || expectedFields.equals( tupleEntry.getFields() );

    if( passThrough )
    {
      safeCollect( tupleEntry );
      return;
    }

    declaredEntry.setTuple( selectTupleFrom( tupleEntry, expectedFields ) );

    safeCollect( declaredEntry );
  }

  /**
   * Selects the wanted fields from the source entry, rethrowing any {@link TupleException} with a message
   * naming both the given and the expected fields.
   */
  private Tuple selectTupleFrom( TupleEntry source, Fields wanted )
  {
    try
    {
      return source.selectTuple( wanted );
    }
    catch( TupleException cause )
    {
      String message = "given TupleEntry fields: " + source.getFields().printVerbose()
        + " do not match the operation declaredFields: " + wanted.printVerbose()
        + ", operations must emit tuples that match the fields they declare as output";

      throw new TupleException( message, cause );
    }
  }

  /**
   * Method addTuple obtains a {@link Tuple} from the given supplier and inserts it into the outgoing stream
   * via {@link #add(Tuple)}.
   * <p>
   * See {@link cascading.tuple.TupleEntryCollector} on when and how to re-use a Tuple instance.
   *
   * @param supplier of type Supplier
   */
  public void addTuple( Supplier<Tuple> supplier )
  {
    Tuple supplied = supplier.get();

    add( supplied );
  }

  /**
   * Method add inserts the given {@link Tuple} into the outgoing stream.
   * <p>
   * The value reported by {@link Tuple#isUnmodifiable()} before collecting is re-applied to the given Tuple
   * once collecting finishes, whether or not it succeeded.
   * <p>
   * See {@link cascading.tuple.TupleEntryCollector} on when and how to re-use a Tuple instance.
   *
   * @param tuple of type Tuple
   * @throws TupleException if the fields are known and the tuple size differs from them, or collecting fails
   */
  public void add( Tuple tuple )
  {
    Fields declaredFields = tupleEntry.getFields();

    boolean sizeAccepted = declaredFields.isUnknown() || declaredFields.size() == tuple.size();

    if( !sizeAccepted )
    {
      throw new TupleException( "operation added the wrong number of fields, expected: " + declaredFields.print() + ", got result size: " + tuple.size() );
    }

    // NOTE(review): presumably setTuple on the internal entry can change the tuple's flag - confirm in TupleEntry
    final boolean wasUnmodifiable = tuple.isUnmodifiable();

    tupleEntry.setTuple( tuple );

    try
    {
      safeCollect( tupleEntry );
    }
    finally
    {
      Tuples.setUnmodifiable( tuple, wasUnmodifiable );
    }
  }

  /**
   * Hands the outgoing entry to {@link #collect(TupleEntry)}, converting any {@link IOException} into an
   * unchecked {@link TupleException}.
   */
  private void safeCollect( TupleEntry outgoing )
  {
    try
    {
      collect( outgoing );
    }
    catch( IOException ioFailure )
    {
      throw new TupleException( "unable to collect tuple", ioFailure );
    }
  }

  /**
   * Method collect receives every entry passed to the {@code add} methods and is implemented by sub-classes.
   *
   * @param tupleEntry of type TupleEntry
   * @throws IOException when the entry cannot be collected, callers in this class rethrow it as a TupleException
   */
  protected abstract void collect( TupleEntry tupleEntry ) throws IOException;

  /**
   * Method close closes the underlying resource being written to. This implementation does nothing.
   * <p>
   * This method should be called when an instance is returned via
   * {@link cascading.tap.Tap#openForWrite(cascading.flow.FlowProcess)}
   * and no more {@link Tuple} instances will be written out.
   * <p>
   * This method must not be called when an instance is returned from {@code getOutputCollector()} from any of
   * the relevant {@link cascading.operation.OperationCall} implementations (inside a Function, Aggregator, or Buffer).
   */
  @Override
  public void close()
  {
    // do nothing
  }
}