001/* 002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.scheme.hadoop; 023 024import java.beans.ConstructorProperties; 025import java.io.IOException; 026import java.nio.charset.Charset; 027 028import cascading.flow.FlowProcess; 029import cascading.management.annotation.Property; 030import cascading.management.annotation.PropertyDescription; 031import cascading.management.annotation.Visibility; 032import cascading.scheme.SinkCall; 033import cascading.scheme.SourceCall; 034import cascading.scheme.util.DelimitedParser; 035import cascading.tap.CompositeTap; 036import cascading.tap.Tap; 037import cascading.tap.TapException; 038import cascading.tap.hadoop.Hfs; 039import cascading.tuple.Fields; 040import cascading.tuple.Tuple; 041import cascading.tuple.TupleEntry; 042import cascading.tuple.util.TupleViews; 043import org.apache.hadoop.conf.Configuration; 044import org.apache.hadoop.io.LongWritable; 045import org.apache.hadoop.io.Text; 046import org.apache.hadoop.mapred.OutputCollector; 047import org.apache.hadoop.mapred.RecordReader; 048 049/** 050 * Class TextDelimited is a sub-class of {@link TextLine}. 
It provides direct support for delimited text files, like
 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
 * <p>
 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
 * be skipped.
 * <p>
 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
 * file and used during planning. The header will be parsed with the same rules as the body of the file.
 * <p>
 * By default headers are not skipped.
 * <p>
 * TextDelimited may also be used to write a "header" in a file. The field names for the header are taken directly
 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
 * resolved field names will be used, if any.
 * <p>
 * By default headers are not written.
 * <p>
 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
 * be set to {@code true}.
 * <p>
 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
 * <p>
 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
 * for the missing fields.
 * <p>
 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
 * If safe is {@code false}, a {@link TapException} will be thrown.
 * <p>
 * Also by default, {@code quote} strings are not searched for to improve processing speed.
If a file is
 * COMMA delimited but may have COMMAs in a value, the whole value should be surrounded by the quote string, typically
 * double quotes ({@literal "}).
 * <p>
 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
 * <p>
 * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically
 * default to strict == false as the number of fields parsed is arbitrary or unknown. A type array may not be given
 * either, so all values will be returned as Strings.
 * <p>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 * <p>
 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
 * <p>
 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
 * <p>
 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
 * <p>
 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
 * are responsible for cleansing large data-sets when faced with the problem.
 * <p>
 * DelimitedParser may be sub-classed and extended if necessary.
107 * 108 * @see TextLine 109 */ 110public class TextDelimited extends TextLine 111 { 112 public static final String DEFAULT_CHARSET = "UTF-8"; 113 114 /** Field delimitedParser */ 115 protected final DelimitedParser delimitedParser; 116 /** Field skipHeader */ 117 private boolean skipHeader; 118 private final boolean writeHeader; 119 120 /** 121 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 122 * {@link Fields#ALL} and using TAB as the default delimiter. 123 * <p> 124 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 125 * with a {@link cascading.pipe.Checkpoint} Tap. 126 */ 127 public TextDelimited() 128 { 129 this( Fields.ALL, null, "\t", null, null ); 130 } 131 132 /** 133 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 134 * {@link Fields#ALL} and using TAB as the default delimiter. 135 * <p> 136 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 137 * with a {@link cascading.pipe.Checkpoint} Tap. 138 * 139 * @param hasHeader of type boolean 140 * @param delimiter of type String 141 */ 142 @ConstructorProperties({"hasHeader", "delimiter"}) 143 public TextDelimited( boolean hasHeader, String delimiter ) 144 { 145 this( Fields.ALL, null, hasHeader, delimiter, null, (Class[]) null ); 146 } 147 148 /** 149 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 150 * {@link Fields#ALL} and using TAB as the default delimiter. 151 * <p> 152 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 153 * with a {@link cascading.pipe.Checkpoint} Tap. 
154 * 155 * @param hasHeader of type boolean 156 * @param delimiter of type String 157 * @param quote of type String 158 */ 159 @ConstructorProperties({"hasHeader", "delimiter", "quote"}) 160 public TextDelimited( boolean hasHeader, String delimiter, String quote ) 161 { 162 this( Fields.ALL, null, hasHeader, delimiter, quote, (Class[]) null ); 163 } 164 165 /** 166 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 167 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 168 * <p> 169 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 170 * with a {@link cascading.pipe.Checkpoint} Tap. 171 * 172 * @param hasHeader of type boolean 173 * @param delimitedParser of type DelimitedParser 174 */ 175 @ConstructorProperties({"hasHeader", "delimitedParser"}) 176 public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser ) 177 { 178 this( Fields.ALL, null, hasHeader, hasHeader, delimitedParser ); 179 } 180 181 /** 182 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 183 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 184 * <p> 185 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 186 * with a {@link cascading.pipe.Checkpoint} Tap. 187 * <p> 188 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 189 * 190 * @param delimitedParser of type DelimitedParser 191 */ 192 @ConstructorProperties({"delimitedParser"}) 193 public TextDelimited( DelimitedParser delimitedParser ) 194 { 195 this( Fields.ALL, null, true, true, delimitedParser ); 196 } 197 198 /** 199 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 200 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 
201 * <p> 202 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 203 * with a {@link cascading.pipe.Checkpoint} Tap. 204 * 205 * @param sinkCompression of type Compress 206 * @param hasHeader of type boolean 207 * @param delimitedParser of type DelimitedParser 208 */ 209 @ConstructorProperties({"sinkCompression", "hasHeader", "delimitedParser"}) 210 public TextDelimited( Compress sinkCompression, boolean hasHeader, DelimitedParser delimitedParser ) 211 { 212 this( Fields.ALL, sinkCompression, hasHeader, hasHeader, delimitedParser ); 213 } 214 215 /** 216 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 217 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 218 * <p> 219 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 220 * with a {@link cascading.pipe.Checkpoint} Tap. 221 * <p> 222 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 223 * 224 * @param delimitedParser of type DelimitedParser 225 */ 226 @ConstructorProperties({"sinkCompression", "delimitedParser"}) 227 public TextDelimited( Compress sinkCompression, DelimitedParser delimitedParser ) 228 { 229 this( Fields.ALL, sinkCompression, true, true, delimitedParser ); 230 } 231 232 /** 233 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 234 * {@link Fields#ALL} and using TAB as the default delimiter. 235 * <p> 236 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 237 * with a {@link cascading.pipe.Checkpoint} Tap. 
238 * 239 * @param sinkCompression of type Compress 240 * @param hasHeader of type boolean 241 * @param delimiter of type String 242 * @param quote of type String 243 */ 244 @ConstructorProperties({"sinkCompression", "hasHeader", "delimiter", "quote"}) 245 public TextDelimited( Compress sinkCompression, boolean hasHeader, String delimiter, String quote ) 246 { 247 this( Fields.ALL, sinkCompression, hasHeader, delimiter, quote, (Class[]) null ); 248 } 249 250 /** 251 * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter. 252 * 253 * @param fields of type Fields 254 */ 255 @ConstructorProperties({"fields"}) 256 public TextDelimited( Fields fields ) 257 { 258 this( fields, null, "\t", null, null ); 259 } 260 261 /** 262 * Constructor TextDelimited creates a new TextDelimited instance. 263 * 264 * @param fields of type Fields 265 * @param delimiter of type String 266 */ 267 @ConstructorProperties({"fields", "delimiter"}) 268 public TextDelimited( Fields fields, String delimiter ) 269 { 270 this( fields, null, delimiter, null, null ); 271 } 272 273 /** 274 * Constructor TextDelimited creates a new TextDelimited instance. 275 * 276 * @param fields of type Fields 277 * @param hasHeader of type boolean 278 * @param delimiter of type String 279 */ 280 @ConstructorProperties({"fields", "hasHeader", "delimiter"}) 281 public TextDelimited( Fields fields, boolean hasHeader, String delimiter ) 282 { 283 this( fields, null, hasHeader, hasHeader, delimiter, null, null ); 284 } 285 286 /** 287 * Constructor TextDelimited creates a new TextDelimited instance. 
288 * 289 * @param fields of type Fields 290 * @param skipHeader of type boolean 291 * @param writeHeader of type boolean 292 * @param delimiter of type String 293 */ 294 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"}) 295 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter ) 296 { 297 this( fields, null, skipHeader, writeHeader, delimiter, null, null ); 298 } 299 300 /** 301 * Constructor TextDelimited creates a new TextDelimited instance. 302 * 303 * @param fields of type Fields 304 * @param delimiter of type String 305 * @param types of type Class[] 306 */ 307 @ConstructorProperties({"fields", "delimiter", "types"}) 308 public TextDelimited( Fields fields, String delimiter, Class[] types ) 309 { 310 this( fields, null, delimiter, null, types ); 311 } 312 313 /** 314 * Constructor TextDelimited creates a new TextDelimited instance. 315 * 316 * @param fields of type Fields 317 * @param hasHeader of type boolean 318 * @param delimiter of type String 319 * @param types of type Class[] 320 */ 321 @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"}) 322 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types ) 323 { 324 this( fields, null, hasHeader, hasHeader, delimiter, null, types ); 325 } 326 327 /** 328 * Constructor TextDelimited creates a new TextDelimited instance. 329 * 330 * @param fields of type Fields 331 * @param skipHeader of type boolean 332 * @param writeHeader of type boolean 333 * @param delimiter of type String 334 * @param types of type Class[] 335 */ 336 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"}) 337 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 338 { 339 this( fields, null, skipHeader, writeHeader, delimiter, null, types ); 340 } 341 342 /** 343 * Constructor TextDelimited creates a new TextDelimited instance. 
344 * 345 * @param fields of type Fields 346 * @param delimiter of type String 347 * @param quote of type String 348 * @param types of type Class[] 349 */ 350 @ConstructorProperties({"fields", "delimiter", "quote", "types"}) 351 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types ) 352 { 353 this( fields, null, delimiter, quote, types ); 354 } 355 356 /** 357 * Constructor TextDelimited creates a new TextDelimited instance. 358 * 359 * @param fields of type Fields 360 * @param hasHeader of type boolean 361 * @param delimiter of type String 362 * @param quote of type String 363 * @param types of type Class[] 364 */ 365 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"}) 366 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types ) 367 { 368 this( fields, null, hasHeader, hasHeader, delimiter, quote, types ); 369 } 370 371 /** 372 * Constructor TextDelimited creates a new TextDelimited instance. 373 * 374 * @param fields of type Fields 375 * @param skipHeader of type boolean 376 * @param writeHeader of type boolean 377 * @param delimiter of type String 378 * @param quote of type String 379 * @param types of type Class[] 380 */ 381 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 382 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 383 { 384 this( fields, null, skipHeader, writeHeader, delimiter, quote, types ); 385 } 386 387 /** 388 * Constructor TextDelimited creates a new TextDelimited instance. 
389 * 390 * @param fields of type Fields 391 * @param delimiter of type String 392 * @param quote of type String 393 * @param types of type Class[] 394 * @param safe of type boolean 395 */ 396 @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"}) 397 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe ) 398 { 399 this( fields, null, delimiter, quote, types, safe ); 400 } 401 402 /** 403 * Constructor TextDelimited creates a new TextDelimited instance. 404 * 405 * @param fields of type Fields 406 * @param hasHeader of type boolean 407 * @param delimiter of type String 408 * @param quote of type String 409 * @param types of type Class[] 410 * @param safe of type boolean 411 */ 412 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"}) 413 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 414 { 415 this( fields, null, hasHeader, hasHeader, delimiter, quote, types, safe ); 416 } 417 418 /** 419 * Constructor TextDelimited creates a new TextDelimited instance. 420 * 421 * @param fields of type Fields 422 * @param hasHeader of type boolean 423 * @param delimiter of type String 424 * @param quote of type String 425 * @param types of type Class[] 426 * @param safe of type boolean 427 * @param charsetName of type String 428 */ 429 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"}) 430 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName ) 431 { 432 this( fields, null, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName ); 433 } 434 435 /** 436 * Constructor TextDelimited creates a new TextDelimited instance. 
437 * 438 * @param fields of type Fields 439 * @param skipHeader of type boolean 440 * @param writeHeader of type boolean 441 * @param delimiter of type String 442 * @param quote of type String 443 * @param types of type Class[] 444 * @param safe of type boolean 445 */ 446 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"}) 447 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 448 { 449 this( fields, null, skipHeader, writeHeader, delimiter, quote, types, safe ); 450 } 451 452 /** 453 * Constructor TextDelimited creates a new TextDelimited instance. 454 * 455 * @param fields of type Fields 456 * @param sinkCompression of type Compress 457 * @param delimiter of type String 458 */ 459 @ConstructorProperties({"fields", "sinkCompression", "delimiter"}) 460 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter ) 461 { 462 this( fields, sinkCompression, delimiter, null, null ); 463 } 464 465 /** 466 * Constructor TextDelimited creates a new TextDelimited instance. 467 * 468 * @param fields of type Fields 469 * @param sinkCompression of type Compress 470 * @param hasHeader of type boolean 471 * @param delimiter of type String 472 */ 473 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter"}) 474 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter ) 475 { 476 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, null ); 477 } 478 479 /** 480 * Constructor TextDelimited creates a new TextDelimited instance. 
481 * 482 * @param fields of type Fields 483 * @param sinkCompression of type Compress 484 * @param skipHeader of type boolean 485 * @param writeHeader of type boolean 486 * @param delimiter of type String 487 */ 488 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter"}) 489 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter ) 490 { 491 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, null ); 492 } 493 494 /** 495 * Constructor TextDelimited creates a new TextDelimited instance. 496 * 497 * @param fields of type Fields 498 * @param sinkCompression of type Compress 499 * @param delimiter of type String 500 * @param types of type Class[] 501 */ 502 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types"}) 503 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types ) 504 { 505 this( fields, sinkCompression, delimiter, null, types ); 506 } 507 508 /** 509 * Constructor TextDelimited creates a new TextDelimited instance. 510 * 511 * @param fields of type Fields 512 * @param sinkCompression of type Compress 513 * @param hasHeader of type boolean 514 * @param delimiter of type String 515 * @param types of type Class[] 516 */ 517 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types"}) 518 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types ) 519 { 520 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types ); 521 } 522 523 /** 524 * Constructor TextDelimited creates a new TextDelimited instance. 
525 * 526 * @param fields of type Fields 527 * @param sinkCompression of type Compress 528 * @param skipHeader of type boolean 529 * @param writeHeader of type boolean 530 * @param delimiter of type String 531 * @param types of type Class[] 532 */ 533 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types"}) 534 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 535 { 536 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types ); 537 } 538 539 /** 540 * Constructor TextDelimited creates a new TextDelimited instance. 541 * 542 * @param fields of type Fields 543 * @param sinkCompression of type Compress 544 * @param delimiter of type String 545 * @param types of type Class[] 546 * @param safe of type boolean 547 */ 548 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types", "safe"}) 549 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types, boolean safe ) 550 { 551 this( fields, sinkCompression, delimiter, null, types, safe ); 552 } 553 554 /** 555 * Constructor TextDelimited creates a new TextDelimited instance. 556 * 557 * @param fields of type Fields 558 * @param sinkCompression of type Compress 559 * @param hasHeader of type boolean 560 * @param delimiter of type String 561 * @param types of type Class[] 562 * @param safe of type boolean 563 */ 564 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe"}) 565 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe ) 566 { 567 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types, safe ); 568 } 569 570 /** 571 * Constructor TextDelimited creates a new TextDelimited instance. 
572 * 573 * @param fields of type Fields 574 * @param sinkCompression of type Compress 575 * @param hasHeader of type boolean 576 * @param delimiter of type String 577 * @param types of type Class[] 578 * @param safe of type boolean 579 * @param charsetName of type String 580 */ 581 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe", "charsetName"}) 582 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe, String charsetName ) 583 { 584 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, null, types, safe, charsetName ); 585 } 586 587 /** 588 * Constructor TextDelimited creates a new TextDelimited instance. 589 * 590 * @param fields of type Fields 591 * @param sinkCompression of type Compress 592 * @param skipHeader of type boolean 593 * @param writeHeader of type boolean 594 * @param delimiter of type String 595 * @param types of type Class[] 596 * @param safe of type boolean 597 */ 598 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types", "safe"}) 599 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types, boolean safe ) 600 { 601 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types, safe ); 602 } 603 604 /** 605 * Constructor TextDelimited creates a new TextDelimited instance. 606 * 607 * @param fields of type Fields 608 * @param delimiter of type String 609 * @param quote of type String 610 */ 611 @ConstructorProperties({"fields", "delimiter", "quote"}) 612 public TextDelimited( Fields fields, String delimiter, String quote ) 613 { 614 this( fields, null, delimiter, quote ); 615 } 616 617 /** 618 * Constructor TextDelimited creates a new TextDelimited instance. 
619 * 620 * @param fields of type Fields 621 * @param hasHeader of type boolean 622 * @param delimiter of type String 623 * @param quote of type String 624 */ 625 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"}) 626 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote ) 627 { 628 this( fields, null, hasHeader, hasHeader, delimiter, quote ); 629 } 630 631 /** 632 * Constructor TextDelimited creates a new TextDelimited instance. 633 * 634 * @param fields of type Fields 635 * @param skipHeader of type boolean 636 * @param writeHeader of type boolean 637 * @param delimiter of type String 638 * @param quote of type String 639 */ 640 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote"}) 641 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote ) 642 { 643 this( fields, null, skipHeader, writeHeader, delimiter, quote ); 644 } 645 646 /** 647 * Constructor TextDelimited creates a new TextDelimited instance. 648 * 649 * @param fields of type Fields 650 * @param sinkCompression of type Compress 651 * @param delimiter of type String 652 * @param quote of type String 653 */ 654 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote"}) 655 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote ) 656 { 657 this( fields, sinkCompression, false, false, delimiter, true, quote, null, true ); 658 } 659 660 /** 661 * Constructor TextDelimited creates a new TextDelimited instance. 
662 * 663 * @param fields of type Fields 664 * @param sinkCompression of type Compress 665 * @param hasHeader of type boolean 666 * @param delimiter of type String 667 * @param quote of type String 668 */ 669 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote"}) 670 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote ) 671 { 672 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true ); 673 } 674 675 /** 676 * Constructor TextDelimited creates a new TextDelimited instance. 677 * 678 * @param fields of type Fields 679 * @param sinkCompression of type Compress 680 * @param hasHeader of type boolean 681 * @param delimiter of type String 682 * @param quote of type String 683 * @param charsetName of type String 684 */ 685 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "charsetName"}) 686 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, String charsetName ) 687 { 688 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true, charsetName ); 689 } 690 691 /** 692 * Constructor TextDelimited creates a new TextDelimited instance. 693 * 694 * @param fields of type Fields 695 * @param sinkCompression of type Compress 696 * @param skipHeader of type boolean 697 * @param writeHeader of type boolean 698 * @param delimiter of type String 699 * @param quote of type String 700 */ 701 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote"}) 702 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote ) 703 { 704 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, null, true ); 705 } 706 707 /** 708 * Constructor TextDelimited creates a new TextDelimited instance. 
709 * 710 * @param fields of type Fields 711 * @param sinkCompression of type Compress 712 * @param delimiter of type String 713 * @param quote of type String 714 * @param types of type Class[] 715 */ 716 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types"}) 717 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types ) 718 { 719 this( fields, sinkCompression, false, false, delimiter, true, quote, types, true ); 720 } 721 722 /** 723 * Constructor TextDelimited creates a new TextDelimited instance. 724 * 725 * @param fields of type Fields 726 * @param sinkCompression of type Compress 727 * @param hasHeader of type boolean 728 * @param delimiter of type String 729 * @param quote of type String 730 * @param types of type Class[] 731 */ 732 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types"}) 733 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types ) 734 { 735 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, true ); 736 } 737 738 /** 739 * Constructor TextDelimited creates a new TextDelimited instance. 
740 * 741 * @param fields of type Fields 742 * @param sinkCompression of type Compress 743 * @param skipHeader of type boolean 744 * @param writeHeader of type boolean 745 * @param delimiter of type String 746 * @param quote of type String 747 * @param types of type Class[] 748 */ 749 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 750 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 751 { 752 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, true ); 753 } 754 755 /** 756 * Constructor TextDelimited creates a new TextDelimited instance. 757 * 758 * @param fields of type Fields 759 * @param sinkCompression of type Compress 760 * @param delimiter of type String 761 * @param quote of type String 762 * @param types of type Class[] 763 * @param safe of type boolean 764 */ 765 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types", "safe"}) 766 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types, boolean safe ) 767 { 768 this( fields, sinkCompression, false, false, delimiter, true, quote, types, safe ); 769 } 770 771 /** 772 * Constructor TextDelimited creates a new TextDelimited instance. 
773 * 774 * @param fields of type Fields 775 * @param sinkCompression of type Compress 776 * @param hasHeader of type boolean 777 * @param delimiter of type String 778 * @param quote of type String 779 * @param types of type Class[] 780 * @param safe of type boolean 781 */ 782 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types", "safe"}) 783 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 784 { 785 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, safe ); 786 } 787 788 /** 789 * Constructor TextDelimited creates a new TextDelimited instance. 790 * 791 * @param fields of type Fields 792 * @param sinkCompression of type Compress 793 * @param skipHeader of type boolean 794 * @param writeHeader of type boolean 795 * @param delimiter of type String 796 * @param quote of type String 797 * @param types of type Class[] 798 * @param safe of type boolean 799 */ 800 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types", 801 "safe"}) 802 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 803 { 804 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, safe ); 805 } 806 807 /** 808 * Constructor TextDelimited creates a new TextDelimited instance. 
809 * 810 * @param fields of type Fields 811 * @param sinkCompression of type Compress 812 * @param skipHeader of type boolean 813 * @param delimiter of type String 814 * @param strict of type boolean 815 * @param quote of type String 816 * @param types of type Class[] 817 * @param safe of type boolean 818 */ 819 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote", 820 "types", "safe"}) 821 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe ) 822 { 823 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET ); 824 } 825 826 /** 827 * Constructor TextDelimited creates a new TextDelimited instance. 828 * 829 * @param fields of type Fields 830 * @param sinkCompression of type Compress 831 * @param skipHeader of type boolean 832 * @param delimiter of type String 833 * @param strict of type boolean 834 * @param quote of type String 835 * @param types of type Class[] 836 * @param safe of type boolean 837 * @param charsetName of type String 838 */ 839 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote", 840 "types", "safe", "charsetName"}) 841 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName ) 842 { 843 this( fields, sinkCompression, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) ); 844 } 845 846 /** 847 * Constructor TextDelimited creates a new TextDelimited instance. 
848 * 849 * @param fields of type Fields 850 * @param writeHeader of type boolean 851 * @param delimitedParser of type DelimitedParser 852 */ 853 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"}) 854 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 855 { 856 this( fields, null, skipHeader, writeHeader, null, delimitedParser ); 857 } 858 859 /** 860 * Constructor TextDelimited creates a new TextDelimited instance. 861 * 862 * @param fields of type Fields 863 * @param hasHeader of type boolean 864 * @param delimitedParser of type DelimitedParser 865 */ 866 @ConstructorProperties({"fields", "hasHeader", "delimitedParser"}) 867 public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser ) 868 { 869 this( fields, null, hasHeader, hasHeader, null, delimitedParser ); 870 } 871 872 /** 873 * Constructor TextDelimited creates a new TextDelimited instance. 874 * 875 * @param fields of type Fields 876 * @param writeHeader of type boolean 877 * @param delimitedParser of type DelimitedParser 878 */ 879 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimitedParser"}) 880 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 881 { 882 this( fields, sinkCompression, skipHeader, writeHeader, null, delimitedParser ); 883 } 884 885 /** 886 * Constructor TextDelimited creates a new TextDelimited instance. 
887 * 888 * @param fields of type Fields 889 * @param sinkCompression of type Compress 890 * @param skipHeader of type boolean 891 * @param writeHeader of type boolean 892 * @param charsetName of type String 893 * @param delimitedParser of type DelimitedParser 894 */ 895 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "charsetName", "delimitedParser"}) 896 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser ) 897 { 898 super( sinkCompression ); 899 900 this.delimitedParser = delimitedParser; 901 902 // normalizes ALL and UNKNOWN 903 setSinkFields( fields ); 904 setSourceFields( fields ); 905 906 this.skipHeader = skipHeader; 907 this.writeHeader = writeHeader; 908 909 // throws an exception if not found 910 setCharsetName( charsetName ); 911 } 912 913 /** 914 * Method getDelimiter returns the delimiter used to parse fields from the current line of text. 915 * 916 * @return a String 917 */ 918 @Property(name = "delimiter", visibility = Visibility.PUBLIC) 919 @PropertyDescription("The delimiter used to separate fields.") 920 public String getDelimiter() 921 { 922 return delimitedParser.getDelimiter(); 923 } 924 925 /** 926 * Method getQuote returns the quote string, if any, used to encapsulate each field in a line to delimited text. 
927 * 928 * @return a String 929 */ 930 @Property(name = "quote", visibility = Visibility.PUBLIC) 931 @PropertyDescription("The string used for quoting.") 932 public String getQuote() 933 { 934 return delimitedParser.getQuote(); 935 } 936 937 @Override 938 public boolean isSymmetrical() 939 { 940 return super.isSymmetrical() && skipHeader == writeHeader; 941 } 942 943 @Override 944 public void setSinkFields( Fields sinkFields ) 945 { 946 super.setSourceFields( sinkFields ); 947 super.setSinkFields( sinkFields ); 948 949 if( delimitedParser != null ) 950 delimitedParser.reset( getSourceFields(), getSinkFields() ); 951 } 952 953 @Override 954 public void setSourceFields( Fields sourceFields ) 955 { 956 super.setSourceFields( sourceFields ); 957 super.setSinkFields( sourceFields ); 958 959 if( delimitedParser != null ) 960 delimitedParser.reset( getSourceFields(), getSinkFields() ); 961 } 962 963 @Override 964 public Fields retrieveSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap ) 965 { 966 if( !skipHeader || !getSourceFields().isUnknown() ) 967 return getSourceFields(); 968 969 // no need to open them all 970 if( tap instanceof CompositeTap ) 971 tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next(); 972 973 // should revert to file:// (Lfs) if tap is Lfs 974 tap = new Hfs( new TextLine( new Fields( "line" ), charsetName ), tap.getFullIdentifier( flowProcess ) ); 975 976 setSourceFields( delimitedParser.parseFirstLine( flowProcess, tap ) ); 977 978 return getSourceFields(); 979 } 980 981 @Override 982 public void presentSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields ) 983 { 984 presentSourceFieldsInternal( fields ); 985 } 986 987 @Override 988 public void presentSinkFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields ) 989 { 990 presentSinkFieldsInternal( fields ); 991 } 992 993 @Override 994 public void sourcePrepare( FlowProcess<? 
extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) 995 { 996 super.sourcePrepare( flowProcess, sourceCall ); 997 998 sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() ); 999 } 1000 1001 @Override 1002 public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException 1003 { 1004 Object[] context = sourceCall.getContext(); 1005 1006 if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) ) 1007 return false; 1008 1009 if( skipHeader && ( (LongWritable) context[ 0 ] ).get() == 0 ) 1010 { 1011 if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) ) 1012 return false; 1013 } 1014 1015 // delegate coercion to delimitedParser for robustness 1016 Object[] split = delimitedParser.parseLine( makeEncodedString( context ) ); 1017 Tuple tuple = sourceCall.getIncomingEntry().getTuple(); 1018 1019 TupleViews.reset( tuple, split ); 1020 1021 return true; 1022 } 1023 1024 @Override 1025 public void sinkPrepare( FlowProcess<? 
extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1026 { 1027 sinkCall.setContext( new Object[ 3 ] ); 1028 1029 sinkCall.getContext()[ 0 ] = new Text(); 1030 sinkCall.getContext()[ 1 ] = new StringBuilder( 4 * 1024 ); 1031 sinkCall.getContext()[ 2 ] = Charset.forName( charsetName ); 1032 1033 if( writeHeader ) 1034 writeHeader( sinkCall ); 1035 } 1036 1037 protected void writeHeader( SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1038 { 1039 Fields fields = sinkCall.getOutgoingEntry().getFields(); 1040 1041 Text text = (Text) sinkCall.getContext()[ 0 ]; 1042 StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ]; 1043 Charset charset = (Charset) sinkCall.getContext()[ 2 ]; 1044 1045 line = (StringBuilder) delimitedParser.joinFirstLine( fields, line ); 1046 1047 text.set( line.toString().getBytes( charset ) ); 1048 1049 sinkCall.getOutput().collect( null, text ); 1050 1051 line.setLength( 0 ); 1052 } 1053 1054 @Override 1055 public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1056 { 1057 TupleEntry tupleEntry = sinkCall.getOutgoingEntry(); 1058 1059 Text text = (Text) sinkCall.getContext()[ 0 ]; 1060 StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ]; 1061 Charset charset = (Charset) sinkCall.getContext()[ 2 ]; 1062 1063 Iterable<String> strings = tupleEntry.asIterableOf( String.class ); 1064 1065 line = (StringBuilder) delimitedParser.joinLine( strings, line ); 1066 1067 text.set( line.toString().getBytes( charset ) ); 1068 1069 sinkCall.getOutput().collect( null, text ); 1070 1071 line.setLength( 0 ); 1072 } 1073 1074 @Override 1075 public String getExtension() 1076 { 1077 switch( getDelimiter().trim() ) 1078 { 1079 case "\t": 1080 return "tsv"; 1081 1082 case ",": 1083 return "csv"; 1084 } 1085 1086 return "txt"; 1087 } 1088 } 1089