001/* 002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.tuple.collect; 023 024import java.io.Closeable; 025import java.io.File; 026import java.io.Flushable; 027import java.io.IOException; 028import java.util.Collection; 029import java.util.Collections; 030import java.util.Iterator; 031import java.util.LinkedList; 032import java.util.List; 033 034import cascading.flow.FlowProcess; 035import cascading.tuple.Tuple; 036import cascading.tuple.TupleException; 037import cascading.tuple.io.TupleInputStream; 038import cascading.tuple.io.TupleOutputStream; 039import cascading.tuple.util.TupleViews; 040import cascading.util.CloseableIterator; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043 044/** 045 * Class SpillableTupleList is a simple durable Collection that can spill its contents to disk when the 046 * {@code threshold} is met. 047 * <p> 048 * Using a {@code threshold } of -1 will disable the spill, all values will remain in memory. 049 * <p> 050 * This class is used by the {@link cascading.pipe.CoGroup} pipe, to set properties specific to a given 051 * CoGroup instance, see the {@link cascading.pipe.CoGroup#getConfigDef()} method. 052 * <p> 053 * Use the {@link SpillableProps} fluent helper class to set properties. 054 * 055 * @see cascading.tuple.hadoop.collect.HadoopSpillableTupleList 056 */ 057public abstract class SpillableTupleList implements Collection<Tuple>, Spillable 058 { 059 /** Field LOG */ 060 private static final Logger LOG = LoggerFactory.getLogger( SpillableTupleList.class ); 061 062 public static int getThreshold( FlowProcess flowProcess, int defaultValue ) 063 { 064 String value = (String) flowProcess.getProperty( SpillableProps.LIST_THRESHOLD ); 065 066 if( value == null || value.length() == 0 ) 067 return defaultValue; 068 069 return Integer.parseInt( value ); 070 } 071 072 protected static Class getCodecClass( FlowProcess flowProcess, String defaultCodecs, Class subClass ) 073 { 074 String compress = (String) flowProcess.getProperty( SpillableProps.SPILL_COMPRESS ); 075 076 if( compress != null && !Boolean.parseBoolean( compress ) ) 077 return null; 078 079 String codecs = (String) flowProcess.getProperty( SpillableProps.SPILL_CODECS ); 080 081 if( codecs == null || codecs.length() == 0 ) 082 codecs = defaultCodecs; 083 084 Class codecClass = null; 085 086 for( String codec : codecs.split( "[,\\s]+" ) ) 087 { 088 try 089 { 090 LOG.info( "attempting to load codec: {}", codec ); 091 codecClass = Thread.currentThread().getContextClassLoader().loadClass( codec ).asSubclass( subClass ); 092 093 if( codecClass != null ) 094 { 095 LOG.info( "found codec: {}", codec ); 096 break; 097 } 098 } 099 catch( ClassNotFoundException exception ) 100 { 101 // do nothing 102 } 103 } 104 105 if( codecClass == null ) 106 { 107 LOG.warn( "codecs set, but unable to load any: {}", codecs ); 108 return null; 109 } 110 111 return codecClass; 112 } 113 114 private SpillStrategy spillStrategy; 115 116 /** Field files */ 117 private List<File> files = Collections.EMPTY_LIST; // lazy init if we do a spill 118 /** Field current */ 119 private final List<Object[]> current = new LinkedList<Object[]>(); 120 /** Field size */ 121 private int size = 0; 122 /** Fields listener */ 123 private SpillListener spillListener = SpillListener.NULL; 124 125 private Tuple group; 126 127 protected SpillableTupleList( final int threshold ) 128 { 129 this( new SpillStrategy() 130 { 131 132 @Override 133 public boolean doSpill( Spillable spillable, int size ) 134 { 135 return size >= threshold; 136 } 137 138 @Override 139 public String getSpillReason( Spillable spillable ) 140 { 141 return "met threshold: " + threshold; 142 } 143 } ); 144 } 145 146 protected SpillableTupleList( SpillStrategy spillStrategy ) 147 { 148 this.spillStrategy = spillStrategy; 149 } 150 151 @Override 152 public void setGrouping( Tuple group ) 153 { 154 this.group = group; 155 } 156 157 @Override 158 public Tuple getGrouping() 159 { 160 return group; 161 } 162 163 @Override 164 public void setSpillStrategy( SpillStrategy spillStrategy ) 165 { 166 this.spillStrategy = spillStrategy; 167 } 168 169 @Override 170 public void setSpillListener( SpillListener spillListener ) 171 { 172 this.spillListener = spillListener; 173 } 174 175 @Override 176 public int spillCount() 177 { 178 return files.size(); 179 } 180 181 private class SpilledListIterator implements Iterator<Tuple> 182 { 183 int fileIndex = 0; 184 private Iterator<Tuple> lastIterator; 185 private Iterator<Tuple> iterator; 186 187 private SpilledListIterator() 188 { 189 lastIterator = asTupleIterator(); 190 getNextIterator(); 191 } 192 193 private void getNextIterator() 194 { 195 if( iterator instanceof Closeable ) 196 closeSilent( (Closeable) iterator ); 197 198 if( fileIndex < files.size() ) 199 iterator = getIteratorFor( files.get( fileIndex++ ) ); 200 else 201 iterator = lastIterator; 202 } 203 204 private Iterator<Tuple> getIteratorFor( File file ) 205 { 206 spillListener.notifyReadSpillBegin( SpillableTupleList.this ); 207 208 return createIterator( createTupleInputStream( file ) ); 209 } 210 211 public boolean hasNext() 212 { 213 if( isLastCollection() ) 214 return iterator.hasNext(); 215 216 if( iterator.hasNext() ) 217 return true; 218 219 getNextIterator(); 220 221 return hasNext(); 222 } 223 224 public Tuple next() 225 { 226 if( isLastCollection() || iterator.hasNext() ) 227 return iterator.next(); 228 229 getNextIterator(); 230 231 return next(); 232 } 233 234 private boolean isLastCollection() 235 { 236 return iterator == lastIterator; 237 } 238 239 public void remove() 240 { 241 throw new UnsupportedOperationException( "remove is not supported" ); 242 } 243 } 244 245 /** 246 * Method add will add the given {@link cascading.tuple.Tuple} to this list. 247 * 248 * @param tuple of type Tuple 249 */ 250 @Override 251 public boolean add( Tuple tuple ) 252 { 253 doSpill(); // spill if we break over the threshold 254 255 current.add( Tuple.elements( tuple ).toArray( new Object[ tuple.size() ] ) ); 256 size++; 257 258 return true; 259 } 260 261 @Override 262 public int size() 263 { 264 return size; 265 } 266 267 @Override 268 public boolean isEmpty() 269 { 270 return files.isEmpty() && current.size() == 0; 271 } 272 273 private final boolean doSpill() 274 { 275 if( !spillStrategy.doSpill( this, current.size() ) ) 276 return false; 277 278 long start = System.currentTimeMillis(); 279 spillListener.notifyWriteSpillBegin( this, current.size(), spillStrategy.getSpillReason( this ) ); 280 281 File file = createTempFile(); 282 TupleOutputStream dataOutputStream = createTupleOutputStream( file ); 283 284 try 285 { 286 writeList( dataOutputStream, current ); 287 } 288 finally 289 { 290 flushSilent( dataOutputStream ); 291 closeSilent( dataOutputStream ); 292 } 293 294 spillListener.notifyWriteSpillEnd( this, System.currentTimeMillis() - start ); 295 296 if( files == Collections.EMPTY_LIST ) 297 files = new LinkedList<File>(); 298 299 files.add( file ); 300 current.clear(); 301 302 return true; 303 } 304 305 private void flushSilent( Flushable flushable ) 306 { 307 try 308 { 309 flushable.flush(); 310 } 311 catch( IOException exception ) 312 { 313 // ignore 314 } 315 } 316 317 private void closeSilent( Closeable closeable ) 318 { 319 try 320 { 321 closeable.close(); 322 } 323 catch( IOException exception ) 324 { 325 // ignore 326 } 327 } 328 329 private void writeList( TupleOutputStream dataOutputStream, List<Object[]> list ) 330 { 331 try 332 { 333 dataOutputStream.writeLong( list.size() ); 334 335 for( Object[] elements : list ) 336 dataOutputStream.writeElementArray( elements ); 337 } 338 catch( IOException exception ) 339 { 340 throw new TupleException( "unable to write tuple collection to file output stream", exception ); 341 } 342 } 343 344 protected abstract TupleOutputStream createTupleOutputStream( File file ); 345 346 private Iterator<Tuple> createIterator( final TupleInputStream tupleInputStream ) 347 { 348 final long size; 349 350 try 351 { 352 size = tupleInputStream.readLong(); 353 } 354 catch( IOException exception ) 355 { 356 throw new TupleException( "unable to read 'size' of collection from file input stream", exception ); 357 } 358 359 return new CloseableIterator<Tuple>() 360 { 361 Tuple tuple = new Tuple(); 362 long count = 0; 363 364 @Override 365 public boolean hasNext() 366 { 367 return count < size; 368 } 369 370 @Override 371 public Tuple next() 372 { 373 try 374 { 375 return tupleInputStream.readTuple( tuple ); 376 } 377 catch( IOException exception ) 378 { 379 throw new TupleException( "unable to read next tuple from file input stream containing: " + size + " tuples, successfully read tuples: " + count, exception ); 380 } 381 finally 382 { 383 count++; 384 } 385 } 386 387 @Override 388 public void remove() 389 { 390 throw new UnsupportedOperationException( "remove is not supported" ); 391 } 392 393 @Override 394 public void close() throws IOException 395 { 396 tupleInputStream.close(); 397 } 398 }; 399 } 400 401 protected abstract TupleInputStream createTupleInputStream( File file ); 402 403 private File createTempFile() 404 { 405 try 406 { 407 File file = File.createTempFile( "cascading-spillover", null ); 408 file.deleteOnExit(); 409 410 return file; 411 } 412 catch( IOException exception ) 413 { 414 throw new TupleException( "unable to create temporary file", exception ); 415 } 416 } 417 418 @Override 419 public void clear() 420 { 421 files.clear(); 422 current.clear(); 423 size = 0; 424 } 425 426 @Override 427 public Iterator<Tuple> iterator() 428 { 429 if( files.isEmpty() ) 430 return asTupleIterator(); 431 432 return new SpilledListIterator(); 433 } 434 435 private Iterator<Tuple> asTupleIterator() 436 { 437 final Tuple tuple = TupleViews.createObjectArray(); 438 final Iterator<Object[]> iterator = current.iterator(); 439 440 return new Iterator<Tuple>() 441 { 442 @Override 443 public boolean hasNext() 444 { 445 return iterator.hasNext(); 446 } 447 448 @Override 449 public Tuple next() 450 { 451 return TupleViews.reset( tuple, iterator.next() ); 452 } 453 454 @Override 455 public void remove() 456 { 457 } 458 }; 459 } 460 461 // collection methods, this class cannot only be added to, so they aren't implemented 462 @Override 463 public boolean contains( Object object ) 464 { 465 return false; 466 } 467 468 @Override 469 public Object[] toArray() 470 { 471 return new Object[ 0 ]; 472 } 473 474 @Override 475 public <T> T[] toArray( T[] ts ) 476 { 477 return null; 478 } 479 480 @Override 481 public boolean remove( Object object ) 482 { 483 return false; 484 } 485 486 @Override 487 public boolean containsAll( Collection<?> objects ) 488 { 489 return false; 490 } 491 492 @Override 493 public boolean addAll( Collection<? extends Tuple> tuples ) 494 { 495 return false; 496 } 497 498 @Override 499 public boolean removeAll( Collection<?> objects ) 500 { 501 return false; 502 } 503 504 @Override 505 public boolean retainAll( Collection<?> objects ) 506 { 507 return false; 508 } 509 }