001/*
002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.nested.json.hadoop2;
022
023import java.io.ByteArrayInputStream;
024import java.io.ByteArrayOutputStream;
025import java.io.IOException;
026import java.io.InputStreamReader;
027import java.io.OutputStreamWriter;
028import java.nio.charset.Charset;
029
030import cascading.flow.FlowProcess;
031import cascading.nested.json.JSONCoercibleType;
032import cascading.scheme.SinkCall;
033import cascading.scheme.SourceCall;
034import cascading.scheme.hadoop.TextLine;
035import cascading.tuple.Fields;
036import cascading.tuple.Tuple;
037import cascading.tuple.TupleEntry;
038import com.fasterxml.jackson.databind.DeserializationFeature;
039import com.fasterxml.jackson.databind.JsonNode;
040import com.fasterxml.jackson.databind.ObjectMapper;
041import org.apache.hadoop.conf.Configuration;
042import org.apache.hadoop.io.Text;
043import org.apache.hadoop.mapred.OutputCollector;
044import org.apache.hadoop.mapred.RecordReader;
045
046/**
047 * A JSONTextLine is a type of {@link cascading.scheme.Scheme} for JSON text files. Files are broken into
 * lines, where each line is a JSON object. Either a line-feed or a carriage-return is used to signal end of line.
049 * <p>
050 * By default, this scheme returns a {@link Tuple} with one field, "json" with the type {@link JSONCoercibleType}.
051 * <p>
052 * Any {@link Fields} object passed to the constructor will have the JSONCoercibleType.TYPE type applied.
053 * <p>
054 * To create a binary JSON file, use the {@link cascading.scheme.hadoop.SequenceFile} Scheme with one or more
055 * fields having the JSONCoercibleType type.
056 */
057public class JSONTextLine extends TextLine
058  {
059  public static final Fields DEFAULT_FIELDS = new Fields( "json" ).applyTypes( JSONCoercibleType.TYPE );
060
061  private ObjectMapper mapper = new ObjectMapper();
062
063  {
064  // prevents json object from being created with duplicate names at the same level
065  mapper.setConfig( mapper.getDeserializationConfig()
066    .with( DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY ) );
067  }
068
069  /**
070   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
071   * Hadoop based {@link cascading.flow.FlowConnector} instances returning results
072   * with the default field named "json".
073   */
074  public JSONTextLine()
075    {
076    this( DEFAULT_FIELDS );
077    }
078
079  /**
080   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
081   * Hadoop based {@link cascading.flow.FlowConnector} instances.
082   *
083   * @param fields of Fields
084   */
085  public JSONTextLine( Fields fields )
086    {
087    this( fields, null, DEFAULT_CHARSET );
088    }
089
090  /**
091   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
092   * Hadoop based {@link cascading.flow.FlowConnector} instances.
093   *
094   * @param fields      of Fields
095   * @param charsetName of String
096   */
097  public JSONTextLine( Fields fields, String charsetName )
098    {
099    this( fields, null, charsetName );
100    }
101
102  /**
103   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
104   * Hadoop based {@link cascading.flow.FlowConnector} instances.
105   *
106   * @param fields          of Fields
107   * @param sinkCompression of Compress
108   */
109  public JSONTextLine( Fields fields, Compress sinkCompression )
110    {
111    this( fields, sinkCompression, DEFAULT_CHARSET );
112    }
113
114  /**
115   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
116   * Hadoop based {@link cascading.flow.FlowConnector} instances.
117   *
118   * @param fields          of Fields
119   * @param sinkCompression of Compress
120   * @param charsetName     of String
121   */
122  public JSONTextLine( Fields fields, Compress sinkCompression, String charsetName )
123    {
124    super( sinkCompression );
125
126    if( fields == null )
127      throw new IllegalArgumentException( "fields may not be null" );
128
129    if( !fields.isDefined() )
130      throw new IllegalArgumentException( "fields argument must declare a single field" );
131
132    if( fields.size() != 1 )
133      throw new IllegalArgumentException( "may only declare a single source/sink field in the fields argument" );
134
135    fields = fields.hasTypes() ? fields : fields.applyTypes( JSONCoercibleType.TYPE );
136
137    setSinkFields( fields );
138    setSourceFields( fields );
139
140    // throws an exception if not found
141    setCharsetName( charsetName );
142    }
143
144  @Override
145  protected void sourceHandleInput( SourceCall<Object[], RecordReader> sourceCall ) throws IOException
146    {
147    TupleEntry result = sourceCall.getIncomingEntry();
148
149    Object[] context = sourceCall.getContext();
150
151    Text text = (Text) context[ 1 ];
152    JsonNode jsonNode = null;
153
154    if( text.getLength() != 0 )
155      {
156      ByteArrayInputStream inputStream = new ByteArrayInputStream( text.getBytes(), 0, text.getLength() );
157      InputStreamReader reader = new InputStreamReader( inputStream, (Charset) context[ 2 ] );
158      jsonNode = mapper.readTree( reader );
159      }
160
161    result.setObject( 0, jsonNode );
162    }
163
164  @Override
165  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
166    {
167    Text text = (Text) sinkCall.getContext()[ 0 ];
168    Charset charset = (Charset) sinkCall.getContext()[ 1 ];
169
170    JsonNode jsonNode = (JsonNode) sinkCall.getOutgoingEntry().getTuple().getObject( 0 );
171
172    if( jsonNode == null )
173      {
174      text.set( "" );
175      }
176    else
177      {
178      ByteArrayOutputStream outputStream = new ByteArrayOutputStream( 1024 );
179      OutputStreamWriter writer = new OutputStreamWriter( outputStream, charset );
180
181      mapper.writeValue( writer, jsonNode );
182
183      writer.close();
184
185      text.set( outputStream.toByteArray() );
186      }
187
188    // it's ok to use NULL here so the collector does not write anything
189    sinkCall.getOutput().collect( null, text );
190    }
191  }