001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.tuple.collect;
023
024import java.io.Closeable;
025import java.io.File;
026import java.io.Flushable;
027import java.io.IOException;
028import java.util.Collection;
029import java.util.Collections;
030import java.util.Iterator;
031import java.util.LinkedList;
032import java.util.List;
033
034import cascading.flow.FlowProcess;
035import cascading.tuple.Tuple;
036import cascading.tuple.TupleException;
037import cascading.tuple.io.TupleInputStream;
038import cascading.tuple.io.TupleOutputStream;
039import cascading.tuple.util.TupleViews;
040import cascading.util.CloseableIterator;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044/**
045 * Class SpillableTupleList is a simple durable Collection that can spill its contents to disk when the
046 * {@code threshold} is met.
047 * <p>
 * Using a {@code threshold} of -1 will disable the spill, all values will remain in memory.
049 * <p>
050 * This class is used by the {@link cascading.pipe.CoGroup} pipe, to set properties specific to a given
051 * CoGroup instance, see the {@link cascading.pipe.CoGroup#getConfigDef()} method.
052 * <p>
053 * Use the {@link SpillableProps} fluent helper class to set properties.
054 *
055 * @see cascading.tuple.hadoop.collect.HadoopSpillableTupleList
056 */
057public abstract class SpillableTupleList implements Collection<Tuple>, Spillable
058  {
  /** Field LOG is the class logger, used to report codec loading attempts and failures */
  private static final Logger LOG = LoggerFactory.getLogger( SpillableTupleList.class );
061
062  public static int getThreshold( FlowProcess flowProcess, int defaultValue )
063    {
064    String value = (String) flowProcess.getProperty( SpillableProps.LIST_THRESHOLD );
065
066    if( value == null || value.length() == 0 )
067      return defaultValue;
068
069    return Integer.parseInt( value );
070    }
071
072  protected static Class getCodecClass( FlowProcess flowProcess, String defaultCodecs, Class subClass )
073    {
074    String compress = (String) flowProcess.getProperty( SpillableProps.SPILL_COMPRESS );
075
076    if( compress != null && !Boolean.parseBoolean( compress ) )
077      return null;
078
079    String codecs = (String) flowProcess.getProperty( SpillableProps.SPILL_CODECS );
080
081    if( codecs == null || codecs.length() == 0 )
082      codecs = defaultCodecs;
083
084    Class codecClass = null;
085
086    for( String codec : codecs.split( "[,\\s]+" ) )
087      {
088      try
089        {
090        LOG.info( "attempting to load codec: {}", codec );
091        codecClass = Thread.currentThread().getContextClassLoader().loadClass( codec ).asSubclass( subClass );
092
093        if( codecClass != null )
094          {
095          LOG.info( "found codec: {}", codec );
096          break;
097          }
098        }
099      catch( ClassNotFoundException exception )
100        {
101        // do nothing
102        }
103      }
104
105    if( codecClass == null )
106      {
107      LOG.warn( "codecs set, but unable to load any: {}", codecs );
108      return null;
109      }
110
111    return codecClass;
112    }
113
  /** Field spillStrategy is consulted before every add to decide whether the in-memory tuples are spilled */
  private SpillStrategy spillStrategy;

  /** Field files holds the spill files written so far, in the order they were written */
  private List<File> files = Collections.EMPTY_LIST; // lazy init if we do a spill
  /** Field current holds the element arrays of the tuples that have not been spilled */
  private final List<Object[]> current = new LinkedList<Object[]>();
  /** Field size is the number of tuples added since creation or the last clear, spilled or not */
  private int size = 0;
  /** Field spillListener is notified when a spill is written or read back */
  private SpillListener spillListener = SpillListener.NULL;

  /** Field group is the grouping Tuple last given to setGrouping */
  private Tuple group;
126
127  protected SpillableTupleList( final int threshold )
128    {
129    this( new SpillStrategy()
130      {
131
132      @Override
133      public boolean doSpill( Spillable spillable, int size )
134        {
135        return size >= threshold;
136        }
137
138      @Override
139      public String getSpillReason( Spillable spillable )
140        {
141        return "met threshold: " + threshold;
142        }
143      } );
144    }
145
  /**
   * Constructor SpillableTupleList creates a new instance that uses the given strategy to decide when
   * the tuples held in memory are spilled.
   *
   * @param spillStrategy of type SpillStrategy
   */
  protected SpillableTupleList( SpillStrategy spillStrategy )
    {
    this.spillStrategy = spillStrategy;
    }
150
  /**
   * Method setGrouping stores the grouping Tuple for this list, it is returned as-is by {@link #getGrouping()}.
   *
   * @param group of type Tuple
   */
  @Override
  public void setGrouping( Tuple group )
    {
    this.group = group;
    }
156
  /**
   * Method getGrouping returns the grouping Tuple last given to {@link #setGrouping(Tuple)}.
   *
   * @return the grouping Tuple, {@code null} if none was set
   */
  @Override
  public Tuple getGrouping()
    {
    return group;
    }
162
  /**
   * Method setSpillStrategy replaces the strategy consulted before each add to decide whether to spill.
   *
   * @param spillStrategy of type SpillStrategy
   */
  @Override
  public void setSpillStrategy( SpillStrategy spillStrategy )
    {
    this.spillStrategy = spillStrategy;
    }
168
  /**
   * Method setSpillListener replaces the listener notified when spills are written and read,
   * the initial listener is {@code SpillListener.NULL}.
   *
   * @param spillListener of type SpillListener
   */
  @Override
  public void setSpillListener( SpillListener spillListener )
    {
    this.spillListener = spillListener;
    }
174
  /**
   * Method spillCount returns the number of spill files currently held by this list.
   *
   * @return int
   */
  @Override
  public int spillCount()
    {
    return files.size();
    }
180
181  private class SpilledListIterator implements Iterator<Tuple>
182    {
183    int fileIndex = 0;
184    private Iterator<Tuple> lastIterator;
185    private Iterator<Tuple> iterator;
186
187    private SpilledListIterator()
188      {
189      lastIterator = asTupleIterator();
190      getNextIterator();
191      }
192
193    private void getNextIterator()
194      {
195      if( iterator instanceof Closeable )
196        closeSilent( (Closeable) iterator );
197
198      if( fileIndex < files.size() )
199        iterator = getIteratorFor( files.get( fileIndex++ ) );
200      else
201        iterator = lastIterator;
202      }
203
204    private Iterator<Tuple> getIteratorFor( File file )
205      {
206      spillListener.notifyReadSpillBegin( SpillableTupleList.this );
207
208      return createIterator( createTupleInputStream( file ) );
209      }
210
211    public boolean hasNext()
212      {
213      if( isLastCollection() )
214        return iterator.hasNext();
215
216      if( iterator.hasNext() )
217        return true;
218
219      getNextIterator();
220
221      return hasNext();
222      }
223
224    public Tuple next()
225      {
226      if( isLastCollection() || iterator.hasNext() )
227        return iterator.next();
228
229      getNextIterator();
230
231      return next();
232      }
233
234    private boolean isLastCollection()
235      {
236      return iterator == lastIterator;
237      }
238
239    public void remove()
240      {
241      throw new UnsupportedOperationException( "remove is not supported" );
242      }
243    }
244
245  /**
246   * Method add will add the given {@link cascading.tuple.Tuple} to this list.
247   *
248   * @param tuple of type Tuple
249   */
250  @Override
251  public boolean add( Tuple tuple )
252    {
253    doSpill(); // spill if we break over the threshold
254
255    current.add( Tuple.elements( tuple ).toArray( new Object[ tuple.size() ] ) );
256    size++;
257
258    return true;
259    }
260
  /**
   * Method size returns the number of tuples added since creation or the last {@link #clear()},
   * counting both spilled and in-memory tuples.
   *
   * @return int
   */
  @Override
  public int size()
    {
    return size;
    }
266
267  @Override
268  public boolean isEmpty()
269    {
270    return files.isEmpty() && current.size() == 0;
271    }
272
273  private final boolean doSpill()
274    {
275    if( !spillStrategy.doSpill( this, current.size() ) )
276      return false;
277
278    long start = System.currentTimeMillis();
279    spillListener.notifyWriteSpillBegin( this, current.size(), spillStrategy.getSpillReason( this ) );
280
281    File file = createTempFile();
282    TupleOutputStream dataOutputStream = createTupleOutputStream( file );
283
284    try
285      {
286      writeList( dataOutputStream, current );
287      }
288    finally
289      {
290      flushSilent( dataOutputStream );
291      closeSilent( dataOutputStream );
292      }
293
294    spillListener.notifyWriteSpillEnd( this, System.currentTimeMillis() - start );
295
296    if( files == Collections.EMPTY_LIST )
297      files = new LinkedList<File>();
298
299    files.add( file );
300    current.clear();
301
302    return true;
303    }
304
305  private void flushSilent( Flushable flushable )
306    {
307    try
308      {
309      flushable.flush();
310      }
311    catch( IOException exception )
312      {
313      // ignore
314      }
315    }
316
317  private void closeSilent( Closeable closeable )
318    {
319    try
320      {
321      closeable.close();
322      }
323    catch( IOException exception )
324      {
325      // ignore
326      }
327    }
328
329  private void writeList( TupleOutputStream dataOutputStream, List<Object[]> list )
330    {
331    try
332      {
333      dataOutputStream.writeLong( list.size() );
334
335      for( Object[] elements : list )
336        dataOutputStream.writeElementArray( elements );
337      }
338    catch( IOException exception )
339      {
340      throw new TupleException( "unable to write tuple collection to file output stream", exception );
341      }
342    }
343
  /**
   * Method createTupleOutputStream returns the stream used to write a spill to the given file.
   * <p>
   * It is called once per spill with a newly created temporary file.
   *
   * @param file of type File
   * @return TupleOutputStream
   */
  protected abstract TupleOutputStream createTupleOutputStream( File file );
345
346  private Iterator<Tuple> createIterator( final TupleInputStream tupleInputStream )
347    {
348    final long size;
349
350    try
351      {
352      size = tupleInputStream.readLong();
353      }
354    catch( IOException exception )
355      {
356      throw new TupleException( "unable to read 'size' of collection from file input stream", exception );
357      }
358
359    return new CloseableIterator<Tuple>()
360      {
361      Tuple tuple = new Tuple();
362      long count = 0;
363
364      @Override
365      public boolean hasNext()
366        {
367        return count < size;
368        }
369
370      @Override
371      public Tuple next()
372        {
373        try
374          {
375          return tupleInputStream.readTuple( tuple );
376          }
377        catch( IOException exception )
378          {
379          throw new TupleException( "unable to read next tuple from file input stream containing: " + size + " tuples, successfully read tuples: " + count, exception );
380          }
381        finally
382          {
383          count++;
384          }
385        }
386
387      @Override
388      public void remove()
389        {
390        throw new UnsupportedOperationException( "remove is not supported" );
391        }
392
393      @Override
394      public void close() throws IOException
395        {
396        tupleInputStream.close();
397        }
398      };
399    }
400
  /**
   * Method createTupleInputStream returns the stream used to read a spill back from the given file.
   * <p>
   * It is called each time a spill file is opened during iteration.
   *
   * @param file of type File
   * @return TupleInputStream
   */
  protected abstract TupleInputStream createTupleInputStream( File file );
402
403  private File createTempFile()
404    {
405    try
406      {
407      File file = File.createTempFile( "cascading-spillover", null );
408      file.deleteOnExit();
409
410      return file;
411      }
412    catch( IOException exception )
413      {
414      throw new TupleException( "unable to create temporary file", exception );
415      }
416    }
417
418  @Override
419  public void clear()
420    {
421    files.clear();
422    current.clear();
423    size = 0;
424    }
425
426  @Override
427  public Iterator<Tuple> iterator()
428    {
429    if( files.isEmpty() )
430      return asTupleIterator();
431
432    return new SpilledListIterator();
433    }
434
435  private Iterator<Tuple> asTupleIterator()
436    {
437    final Tuple tuple = TupleViews.createObjectArray();
438    final Iterator<Object[]> iterator = current.iterator();
439
440    return new Iterator<Tuple>()
441      {
442      @Override
443      public boolean hasNext()
444        {
445        return iterator.hasNext();
446        }
447
448      @Override
449      public Tuple next()
450        {
451        return TupleViews.reset( tuple, iterator.next() );
452        }
453
454      @Override
455      public void remove()
456        {
457        }
458      };
459    }
460
  // collection methods, this class can only be added to, so they aren't implemented
  /**
   * Method contains is not implemented, it always returns {@code false} without inspecting this list.
   *
   * @param object of type Object, ignored
   * @return always {@code false}
   */
  @Override
  public boolean contains( Object object )
    {
    return false;
    }
467
  /**
   * Method toArray is not implemented, it always returns an empty array regardless of the list contents.
   *
   * @return always a new, empty Object array
   */
  @Override
  public Object[] toArray()
    {
    return new Object[ 0 ];
    }
473
  /**
   * Method toArray is not implemented, it always returns {@code null} and never touches the given array.
   * <p>
   * NOTE(review): returning {@code null} is outside the {@link Collection#toArray(Object[])} contract,
   * callers must not dereference the result.
   *
   * @param ts of type T[], ignored
   * @return always {@code null}
   */
  @Override
  public <T> T[] toArray( T[] ts )
    {
    return null;
    }
479
  /**
   * Method remove is not implemented, nothing is removed and {@code false} is always returned.
   *
   * @param object of type Object, ignored
   * @return always {@code false}
   */
  @Override
  public boolean remove( Object object )
    {
    return false;
    }
485
  /**
   * Method containsAll is not implemented, it always returns {@code false} without inspecting this list.
   *
   * @param objects of type Collection, ignored
   * @return always {@code false}
   */
  @Override
  public boolean containsAll( Collection<?> objects )
    {
    return false;
    }
491
492  @Override
493  public boolean addAll( Collection<? extends Tuple> tuples )
494    {
495    return false;
496    }
497
  /**
   * Method removeAll is not implemented, nothing is removed and {@code false} is always returned.
   *
   * @param objects of type Collection, ignored
   * @return always {@code false}
   */
  @Override
  public boolean removeAll( Collection<?> objects )
    {
    return false;
    }
503
  /**
   * Method retainAll is not implemented, nothing is removed and {@code false} is always returned.
   *
   * @param objects of type Collection, ignored
   * @return always {@code false}
   */
  @Override
  public boolean retainAll( Collection<?> objects )
    {
    return false;
    }
509  }