/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tuple.hadoop.collect;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import cascading.flow.FlowProcess;
import cascading.flow.FlowProcessWrapper;
import cascading.tuple.TupleException;
import cascading.tuple.collect.SpillableTupleList;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.tuple.hadoop.io.HadoopTupleInputStream;
import cascading.tuple.hadoop.io.HadoopTupleOutputStream;
import cascading.tuple.io.TupleInputStream;
import cascading.tuple.io.TupleOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HadoopSpillableTupleList is a simple {@link Iterable} object that can store an unlimited number of
 * {@link cascading.tuple.Tuple} instances by spilling excess to a temporary disk file.
 * <p>
 * Spills will automatically be compressed using the {@link #defaultCodecs} values. To disable compression or
 * change the codecs, see {@link cascading.tuple.collect.SpillableProps#SPILL_COMPRESS} and {@link cascading.tuple.collect.SpillableProps#SPILL_CODECS}.
 * <p>
 * It is recommended to add the Lzo codec to the codec list, if available:
 * {@code "org.apache.hadoop.io.compress.LzoCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec" }
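 * <p>
 * For example, spill compression might be configured through flow properties, a sketch assuming the
 * properties are handed to the flow connector in the usual way:
 * <pre>{@code
 * Properties properties = new Properties();
 *
 * properties.setProperty( SpillableProps.SPILL_COMPRESS, "true" );
 * properties.setProperty( SpillableProps.SPILL_CODECS, HadoopSpillableTupleList.defaultCodecs );
 * }</pre>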
 */
public class HadoopSpillableTupleList extends SpillableTupleList
  {
  private static final Logger LOG = LoggerFactory.getLogger( HadoopSpillableTupleList.class );

  public static final String defaultCodecs = "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec";

  /** Field codec */
  private final CompressionCodec codec;
  /** Field tupleSerialization */
  private final TupleSerialization tupleSerialization;

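  /**
   * Method getCodec returns the first available {@link CompressionCodec} from the given comma separated list of
   * codec class names, instantiated against the underlying {@link Configuration}, or {@code null} if no codec
   * class could be loaded.
   *
   * @param flowProcess   of type FlowProcess
   * @param defaultCodecs of type String, a comma separated list of codec class names
   * @return CompressionCodec
   */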
  public static synchronized CompressionCodec getCodec( FlowProcess<? extends Configuration> flowProcess, String defaultCodecs )
    {
    Class<? extends CompressionCodec> codecClass = getCodecClass( flowProcess, defaultCodecs, CompressionCodec.class );

    if( codecClass == null )
      return null;

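    // unwrap the delegate so the underlying Configuration is used when instantiating the codec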
    if( flowProcess instanceof FlowProcessWrapper )
      flowProcess = ( (FlowProcessWrapper) flowProcess ).getDelegate();

    return ReflectionUtils.newInstance( codecClass, flowProcess.getConfig() );
    }

  /**
   * Constructor HadoopSpillableTupleList creates a new HadoopSpillableTupleList instance using the given threshold
   * value and the given compression codec, if any.
   *
   * @param threshold     of type int
   * @param codec         of type CompressionCodec
   * @param configuration of type Configuration
   */
  public HadoopSpillableTupleList( int threshold, CompressionCodec codec, Configuration configuration )
    {
    super( threshold );
    this.codec = codec;

    if( configuration == null )
      this.tupleSerialization = new TupleSerialization();
    else
      this.tupleSerialization = new TupleSerialization( configuration );
    }

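  /**
   * Constructor HadoopSpillableTupleList creates a new HadoopSpillableTupleList instance using the given threshold
   * value, {@link TupleSerialization} instance, and compression codec, if any.
   *
   * @param threshold          of type int
   * @param tupleSerialization of type TupleSerialization
   * @param codec              of type CompressionCodec
   */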
  public HadoopSpillableTupleList( int threshold, TupleSerialization tupleSerialization, CompressionCodec codec )
    {
    super( threshold );
    this.tupleSerialization = tupleSerialization;
    this.codec = codec;
    }

  @Override
  protected TupleOutputStream createTupleOutputStream( File file )
    {
    OutputStream outputStream;

    try
      {
      outputStream = new FileOutputStream( file );

      Compressor compressor = null;

      if( codec != null )
        {
        compressor = getCompressor();
        outputStream = codec.createOutputStream( outputStream, compressor );
        }

      final Compressor finalCompressor = compressor;

      return new HadoopTupleOutputStream( outputStream, tupleSerialization.getElementWriter() )
        {
        @Override
        public void close() throws IOException
          {
          try
            {
            super.close();
            }
          finally
            {
            if( finalCompressor != null )
              CodecPool.returnCompressor( finalCompressor );
            }
          }
        };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file output stream", exception );
      }
    }

  private Compressor getCompressor()
    {
    // some codecs use direct memory, and the gc for direct memory sometimes cannot keep up,
    // so we attempt to force a gc if we see an OOME once
    try
      {
      return CodecPool.getCompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating compressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );

      return CodecPool.getCompressor( codec );
      }
    }

  @Override
  protected TupleInputStream createTupleInputStream( File file )
    {
    try
      {
      InputStream inputStream;

      inputStream = new FileInputStream( file );

      Decompressor decompressor = null;

      if( codec != null )
        {
        decompressor = getDecompressor();
        inputStream = codec.createInputStream( inputStream, decompressor );
        }

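      // capture the pooled decompressor so the stream's close() can return it to the CodecPool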
      final Decompressor finalDecompressor = decompressor;

      return new HadoopTupleInputStream( inputStream, tupleSerialization.getElementReader() )
        {
        @Override
        public void close() throws IOException
          {
          try
            {
            super.close();
            }
          finally
            {
            if( finalDecompressor != null )
              CodecPool.returnDecompressor( finalDecompressor );
            }
          }
        };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file input stream", exception );
      }
    }

  private Decompressor getDecompressor()
    {
    // some codecs use direct memory, and the gc for direct memory sometimes cannot keep up,
    // so we attempt to force a gc if we see an OOME once
    try
      {
      return CodecPool.getDecompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating decompressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );

      return CodecPool.getDecompressor( codec );
      }
    }
  }