/*
 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.nested.json.hadoop2;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;

import cascading.flow.FlowProcess;
import cascading.nested.json.JSONCoercibleType;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.scheme.hadoop.TextLine;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A JSONTextLine is a type of {@link cascading.scheme.Scheme} for JSON text files. Files are broken into
 * lines, where each line is a JSON object. Either line-feed or carriage-return are used to signal end of line.
 * <p>
 * By default, this scheme returns a {@link Tuple} with one field, "json" with the type {@link JSONCoercibleType}.
 * <p>
 * Any {@link Fields} object passed to the constructor will have the JSONCoercibleType.TYPE type applied.
 * <p>
 * To create a binary JSON file, use the {@link cascading.scheme.hadoop.SequenceFile} Scheme with one or more
 * fields having the JSONCoercibleType type.
 */
public class JSONTextLine extends TextLine
  {
  /** The default single source/sink field, named "json", typed as {@link JSONCoercibleType}. */
  public static final Fields DEFAULT_FIELDS = new Fields( "json" ).applyTypes( JSONCoercibleType.TYPE );

  /**
   * Shared, pre-configured mapper. {@link ObjectMapper} is thread-safe once configured, so a single
   * static instance can serve all scheme instances. Keeping it static (rather than a per-instance
   * field) also avoids attempting to Java-serialize a non-Serializable ObjectMapper when this
   * Scheme is serialized for distribution to the cluster.
   * <p>
   * FAIL_ON_READING_DUP_TREE_KEY prevents a JSON object from being created with duplicate
   * names at the same level.
   */
  private static final ObjectMapper MAPPER = new ObjectMapper()
    .configure( DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY, true );

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances returning results
   * with the default field named "json".
   */
  public JSONTextLine()
    {
    this( DEFAULT_FIELDS );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields of Fields
   */
  public JSONTextLine( Fields fields )
    {
    this( fields, null, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields      of Fields
   * @param charsetName of String
   */
  public JSONTextLine( Fields fields, String charsetName )
    {
    this( fields, null, charsetName );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields          of Fields
   * @param sinkCompression of Compress
   */
  public JSONTextLine( Fields fields, Compress sinkCompression )
    {
    this( fields, sinkCompression, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields          of Fields, must declare exactly one field
   * @param sinkCompression of Compress, may be null
   * @param charsetName     of String, the charset used to read and write lines
   * @throws IllegalArgumentException if fields is null, undefined, or declares more than one field
   */
  public JSONTextLine( Fields fields, Compress sinkCompression, String charsetName )
    {
    super( sinkCompression );

    if( fields == null )
      throw new IllegalArgumentException( "fields may not be null" );

    if( !fields.isDefined() )
      throw new IllegalArgumentException( "fields argument must declare a single field" );

    if( fields.size() != 1 )
      throw new IllegalArgumentException( "may only declare a single source/sink field in the fields argument" );

    // guarantee the declared field carries the JSON coercible type
    fields = fields.hasTypes() ? fields : fields.applyTypes( JSONCoercibleType.TYPE );

    setSinkFields( fields );
    setSourceFields( fields );

    // throws an exception if not found
    setCharsetName( charsetName );
    }

  /**
   * Parses the current line of text into a {@link JsonNode} and places it into the sole
   * tuple position. An empty line yields a null node.
   *
   * @param sourceCall the current source call, its context holds the line Text and Charset
   * @throws IOException if the line is not valid JSON
   */
  @Override
  protected void sourceHandleInput( SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    TupleEntry result = sourceCall.getIncomingEntry();

    Object[] context = sourceCall.getContext();

    Text text = (Text) context[ 1 ];
    JsonNode jsonNode = null;

    if( text.getLength() != 0 )
      {
      ByteArrayInputStream inputStream = new ByteArrayInputStream( text.getBytes(), 0, text.getLength() );

      // try-with-resources guarantees the reader is released even if parsing fails
      try( InputStreamReader reader = new InputStreamReader( inputStream, (Charset) context[ 2 ] ) )
        {
        jsonNode = MAPPER.readTree( reader );
        }
      }

    result.setObject( 0, jsonNode );
    }

  /**
   * Serializes the outgoing {@link JsonNode} to a single line of text in the configured
   * charset. A null node is written as an empty line.
   *
   * @param flowProcess the current flow process
   * @param sinkCall    the current sink call, its context holds the reusable Text and Charset
   * @throws IOException if serialization fails
   */
  @Override
  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
    {
    Text text = (Text) sinkCall.getContext()[ 0 ];
    Charset charset = (Charset) sinkCall.getContext()[ 1 ];

    JsonNode jsonNode = (JsonNode) sinkCall.getOutgoingEntry().getTuple().getObject( 0 );

    if( jsonNode == null )
      {
      text.set( "" );
      }
    else
      {
      ByteArrayOutputStream outputStream = new ByteArrayOutputStream( 1024 );

      // try-with-resources flushes and closes the writer even if writeValue throws
      try( OutputStreamWriter writer = new OutputStreamWriter( outputStream, charset ) )
        {
        MAPPER.writeValue( writer, jsonNode );
        }

      text.set( outputStream.toByteArray() );
      }

    // it's ok to use NULL here so the collector does not write anything
    sinkCall.getOutput().collect( null, text );
    }
  }