001/* 002 * Copyright (c) 2016-2017 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.scheme.local; 023 024import java.beans.ConstructorProperties; 025import java.io.FileOutputStream; 026import java.io.IOException; 027import java.io.InputStream; 028import java.io.InputStreamReader; 029import java.io.LineNumberReader; 030import java.io.OutputStream; 031import java.io.OutputStreamWriter; 032import java.io.PrintWriter; 033import java.io.UnsupportedEncodingException; 034import java.nio.charset.Charset; 035import java.util.Properties; 036 037import cascading.flow.FlowProcess; 038import cascading.management.annotation.Property; 039import cascading.management.annotation.PropertyDescription; 040import cascading.management.annotation.Visibility; 041import cascading.scheme.FileFormat; 042import cascading.scheme.SinkCall; 043import cascading.scheme.SourceCall; 044import cascading.scheme.util.DelimitedParser; 045import cascading.tap.CompositeTap; 046import cascading.tap.SinkMode; 047import cascading.tap.Tap; 048import cascading.tap.TapException; 049import cascading.tap.local.FileTap; 050import cascading.tuple.Fields; 051import cascading.tuple.Tuple; 052import 
cascading.tuple.TupleEntry;
import cascading.tuple.util.TupleViews;

/**
 * Class TextDelimited provides direct support for delimited text files, like
 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
 * <p>
 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
 * be skipped.
 * <p>
 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
 * file and used during planning. The header will be parsed with the same rules as the body of the file.
 * <p>
 * By default headers are not skipped.
 * <p>
 * TextDelimited may also be used to write a "header" in a file. The field names for the header are taken directly
 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
 * resolved field names will be used, if any.
 * <p>
 * By default headers are not written.
 * <p>
 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
 * be set to {@code true}.
 * <p>
 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
 * <p>
 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
 * for the missing fields.
 * <p>
 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
 * If safe is {@code false}, a {@link TapException} will be thrown.
 * <p>
 * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is
 * COMMA delimited but may have COMMAs in a value, the whole value should be surrounded by the quote string, typically
 * double quotes ({@literal "}).
 * <p>
 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
 * <p>
 * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically
 * default to strict == false as the number of fields parsed are arbitrary or unknown. A type array may not be given
 * either, so all values will be returned as Strings.
 * <p>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 * <p>
 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
 * <p>
 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
 * <p>
 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
 * <p>
 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
 * are responsible for cleansing large data-sets when faced with the problem.
 * <p>
 * DelimitedParser may be sub-classed and extended if necessary.
 * <p>
 * In order to read or write compressed files, pass a {@link cascading.scheme.local.CompressorScheme.Compressor}
 * instance to the appropriate constructors.
See {@link Compressors} for provided compression algorithms. 116 * 117 * @see TextLine 118 * @see Compressors 119 */ 120public class TextDelimited extends CompressorScheme<LineNumberReader, PrintWriter> implements FileFormat 121 { 122 public static final String DEFAULT_CHARSET = "UTF-8"; 123 124 private final boolean skipHeader; 125 private final boolean writeHeader; 126 private final DelimitedParser delimitedParser; 127 private String charsetName = DEFAULT_CHARSET; 128 129 /** 130 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 131 * {@link Fields#ALL} and using TAB as the default delimiter. 132 * <p> 133 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 134 * with a {@link cascading.pipe.Checkpoint} Tap. 135 */ 136 public TextDelimited() 137 { 138 this( Fields.ALL ); 139 } 140 141 /** 142 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 143 * {@link Fields#ALL} and using TAB as the default delimiter. 144 * <p> 145 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 146 * with a {@link cascading.pipe.Checkpoint} Tap. 147 * 148 * @param hasHeader 149 * @param delimiter 150 */ 151 @ConstructorProperties({"hasHeader", "delimiter"}) 152 public TextDelimited( boolean hasHeader, String delimiter ) 153 { 154 this( Fields.ALL, hasHeader, delimiter, null, (Class[]) null ); 155 } 156 157 /** 158 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 159 * {@link Fields#ALL} and using TAB as the default delimiter. 160 * <p> 161 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 162 * with a {@link cascading.pipe.Checkpoint} Tap. 
163 * 164 * @param hasHeader 165 * @param delimiter 166 * @param quote 167 */ 168 @ConstructorProperties({"hasHeader", "delimiter", "quote"}) 169 public TextDelimited( boolean hasHeader, String delimiter, String quote ) 170 { 171 this( Fields.ALL, hasHeader, delimiter, quote, (Class[]) null ); 172 } 173 174 /** 175 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 176 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 177 * <p> 178 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 179 * with a {@link cascading.pipe.Checkpoint} Tap. 180 * 181 * @param hasHeader 182 * @param delimitedParser 183 */ 184 @ConstructorProperties({"hasHeader", "delimitedParser"}) 185 public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser ) 186 { 187 this( Fields.ALL, hasHeader, hasHeader, delimitedParser ); 188 } 189 190 /** 191 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 192 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 193 * <p> 194 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 195 * with a {@link cascading.pipe.Checkpoint} Tap. 196 * <p> 197 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 198 * 199 * @param delimitedParser 200 */ 201 @ConstructorProperties({"delimitedParser"}) 202 public TextDelimited( DelimitedParser delimitedParser ) 203 { 204 this( Fields.ALL, true, true, delimitedParser ); 205 } 206 207 /** 208 * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter. 
209 * 210 * @param fields of type Fields 211 */ 212 @ConstructorProperties({"fields"}) 213 public TextDelimited( Fields fields ) 214 { 215 this( fields, "\t", null, null ); 216 } 217 218 /** 219 * Constructor TextDelimited creates a new TextDelimited instance. 220 * 221 * @param fields of type Fields 222 * @param delimiter of type String 223 */ 224 @ConstructorProperties({"fields", "delimiter"}) 225 public TextDelimited( Fields fields, String delimiter ) 226 { 227 this( fields, delimiter, null, null ); 228 } 229 230 /** 231 * Constructor TextDelimited creates a new TextDelimited instance. 232 * 233 * @param fields of type Fields 234 * @param hasHeader of type boolean 235 * @param delimiter of type String 236 */ 237 @ConstructorProperties({"fields", "hasHeader", "delimiter"}) 238 public TextDelimited( Fields fields, boolean hasHeader, String delimiter ) 239 { 240 this( fields, hasHeader, hasHeader, delimiter, null, null ); 241 } 242 243 /** 244 * Constructor TextDelimited creates a new TextDelimited instance. 245 * 246 * @param fields of type Fields 247 * @param skipHeader of type boolean 248 * @param delimiter of type String 249 */ 250 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"}) 251 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter ) 252 { 253 this( fields, skipHeader, writeHeader, delimiter, null, null ); 254 } 255 256 /** 257 * Constructor TextDelimited creates a new TextDelimited instance. 258 * 259 * @param fields of type Fields 260 * @param delimiter of type String 261 * @param types of type Class[] 262 */ 263 @ConstructorProperties({"fields", "delimiter", "types"}) 264 public TextDelimited( Fields fields, String delimiter, Class[] types ) 265 { 266 this( fields, delimiter, null, types ); 267 } 268 269 /** 270 * Constructor TextDelimited creates a new TextDelimited instance. 
271 * 272 * @param fields of type Fields 273 * @param hasHeader of type boolean 274 * @param delimiter of type String 275 * @param types of type Class[] 276 */ 277 @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"}) 278 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types ) 279 { 280 this( fields, hasHeader, hasHeader, delimiter, null, types ); 281 } 282 283 /** 284 * Constructor TextDelimited creates a new TextDelimited instance. 285 * 286 * @param fields of type Fields 287 * @param skipHeader of type boolean 288 * @param writeHeader of type boolean 289 * @param delimiter of type String 290 * @param types of type Class[] 291 */ 292 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"}) 293 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 294 { 295 this( fields, skipHeader, writeHeader, delimiter, null, types ); 296 } 297 298 /** 299 * Constructor TextDelimited creates a new TextDelimited instance. 300 * 301 * @param fields of type Fields 302 * @param delimiter of type String 303 * @param quote of type String 304 * @param types of type Class[] 305 */ 306 @ConstructorProperties({"fields", "delimiter", "quote", "types"}) 307 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types ) 308 { 309 this( fields, false, delimiter, quote, types ); 310 } 311 312 /** 313 * Constructor TextDelimited creates a new TextDelimited instance. 
314 * 315 * @param fields of type Fields 316 * @param hasHeader of type boolean 317 * @param delimiter of type String 318 * @param quote of type String 319 * @param types of type Class[] 320 */ 321 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"}) 322 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types ) 323 { 324 this( fields, hasHeader, hasHeader, delimiter, quote, types, true ); 325 } 326 327 /** 328 * Constructor TextDelimited creates a new TextDelimited instance. 329 * 330 * @param fields of type Fields 331 * @param skipHeader of type boolean 332 * @param writeHeader of type boolean 333 * @param delimiter of type String 334 * @param quote of type String 335 * @param types of type Class[] 336 */ 337 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 338 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 339 { 340 this( fields, skipHeader, writeHeader, delimiter, quote, types, true ); 341 } 342 343 /** 344 * Constructor TextDelimited creates a new TextDelimited instance. 345 * 346 * @param fields of type Fields 347 * @param delimiter of type String 348 * @param quote of type String 349 * @param types of type Class[] 350 * @param safe of type boolean 351 */ 352 @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"}) 353 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe ) 354 { 355 this( fields, false, delimiter, quote, types, safe ); 356 } 357 358 /** 359 * Constructor TextDelimited creates a new TextDelimited instance. 
360 * 361 * @param fields of type Fields 362 * @param hasHeader of type boolean 363 * @param delimiter of type String 364 * @param quote of type String 365 * @param types of type Class[] 366 * @param safe of type boolean 367 */ 368 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"}) 369 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 370 { 371 this( fields, hasHeader, hasHeader, delimiter, true, quote, types, safe ); 372 } 373 374 /** 375 * Constructor TextDelimited creates a new TextDelimited instance. 376 * 377 * @param fields of type Fields 378 * @param hasHeader of type boolean 379 * @param delimiter of type String 380 * @param quote of type String 381 * @param types of type Class[] 382 * @param safe of type boolean 383 * @param charsetName of type String 384 */ 385 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"}) 386 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName ) 387 { 388 this( fields, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName ); 389 } 390 391 /** 392 * Constructor TextDelimited creates a new TextDelimited instance. 
393 * 394 * @param fields of type Fields 395 * @param skipHeader of type boolean 396 * @param writeHeader of type boolean 397 * @param delimiter of type String 398 * @param quote of type String 399 * @param types of type Class[] 400 * @param safe of type boolean 401 */ 402 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"}) 403 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 404 { 405 this( fields, skipHeader, writeHeader, delimiter, true, quote, types, safe ); 406 } 407 408 /** 409 * Constructor TextDelimited creates a new TextDelimited instance. 410 * 411 * @param fields of type Fields 412 * @param delimiter of type String 413 * @param quote of type String 414 */ 415 @ConstructorProperties({"fields", "delimiter", "quote"}) 416 public TextDelimited( Fields fields, String delimiter, String quote ) 417 { 418 this( fields, false, delimiter, quote, null, true ); 419 } 420 421 /** 422 * Constructor TextDelimited creates a new TextDelimited instance. 423 * 424 * @param fields of type Fields 425 * @param hasHeader of type boolean 426 * @param delimiter of type String 427 * @param quote of type String 428 */ 429 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"}) 430 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote ) 431 { 432 this( fields, hasHeader, delimiter, quote, null, true ); 433 } 434 435 /** 436 * Constructor TextDelimited creates a new TextDelimited instance. 
437 * 438 * @param fields of type Fields 439 * @param hasHeader of type boolean 440 * @param delimiter of type String 441 * @param quote of type String 442 * @param charsetName of type String 443 */ 444 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "charsetName"}) 445 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, String charsetName ) 446 { 447 this( fields, hasHeader, delimiter, quote, null, true, charsetName ); 448 } 449 450 /** 451 * Constructor TextDelimited creates a new TextDelimited instance. 452 * 453 * @param fields of type Fields 454 * @param skipHeader of type boolean 455 * @param writeHeader of type boolean 456 * @param delimiter of type String 457 * @param strict of type boolean 458 * @param quote of type String 459 * @param types of type Class[] 460 * @param safe of type boolean 461 */ 462 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", "safe"}) 463 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe ) 464 { 465 this( fields, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET ); 466 } 467 468 /** 469 * Constructor TextDelimited creates a new TextDelimited instance. 
470 * 471 * @param fields of type Fields 472 * @param skipHeader of type boolean 473 * @param writeHeader of type boolean 474 * @param delimiter of type String 475 * @param strict of type boolean 476 * @param quote of type String 477 * @param types of type Class[] 478 * @param safe of type boolean 479 * @param charsetName of type String 480 */ 481 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", "safe", 482 "charsetName"}) 483 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName ) 484 { 485 this( fields, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) ); 486 } 487 488 /** 489 * Constructor TextDelimited creates a new TextDelimited instance. 490 * 491 * @param fields of type Fields 492 * @param writeHeader of type boolean 493 * @param delimitedParser of type DelimitedParser 494 */ 495 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"}) 496 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 497 { 498 this( fields, skipHeader, writeHeader, null, delimitedParser ); 499 } 500 501 /** 502 * Constructor TextDelimited creates a new TextDelimited instance. 503 * 504 * @param fields of type Fields 505 * @param hasHeader of type boolean 506 * @param delimitedParser of type DelimitedParser 507 */ 508 @ConstructorProperties({"fields", "hasHeader", "delimitedParser"}) 509 public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser ) 510 { 511 this( fields, hasHeader, hasHeader, null, delimitedParser ); 512 } 513 514 /** 515 * Constructor TextDelimited creates a new TextDelimited instance. 
516 * 517 * @param fields of type Fields 518 * @param writeHeader of type boolean 519 * @param charsetName of type String 520 * @param delimitedParser of type DelimitedParser 521 */ 522 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "charsetName", "delimitedParser"}) 523 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser ) 524 { 525 this( fields, null, skipHeader, writeHeader, charsetName, delimitedParser ); 526 } 527 528 /** 529 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 530 * {@link Fields#ALL} and using TAB as the default delimiter. 531 * <p> 532 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 533 * with a {@link cascading.pipe.Checkpoint} Tap. 534 * 535 * @param compressor of type Compressor, see {@link Compressors} 536 */ 537 @ConstructorProperties("compressor") 538 public TextDelimited( Compressor compressor ) 539 { 540 this( Fields.ALL, compressor ); 541 } 542 543 /** 544 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 545 * {@link Fields#ALL} and using TAB as the default delimiter. 546 * <p> 547 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 548 * with a {@link cascading.pipe.Checkpoint} Tap. 
549 * 550 * @param compressor of type Compressor, see {@link Compressors} 551 * @param hasHeader 552 * @param delimiter 553 */ 554 @ConstructorProperties({"compressor", "hasHeader", "delimiter"}) 555 public TextDelimited( Compressor compressor, boolean hasHeader, String delimiter ) 556 { 557 this( Fields.ALL, compressor, hasHeader, delimiter, null, (Class[]) null ); 558 } 559 560 /** 561 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 562 * {@link Fields#ALL} and using TAB as the default delimiter. 563 * <p> 564 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 565 * with a {@link cascading.pipe.Checkpoint} Tap. 566 * 567 * @param compressor of type Compressor, see {@link Compressors} 568 * @param hasHeader 569 * @param delimiter 570 * @param quote 571 */ 572 @ConstructorProperties({"compressor", "hasHeader", "delimiter", "quote"}) 573 public TextDelimited( Compressor compressor, boolean hasHeader, String delimiter, String quote ) 574 { 575 this( Fields.ALL, compressor, hasHeader, delimiter, quote, (Class[]) null ); 576 } 577 578 /** 579 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 580 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 581 * <p> 582 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 583 * with a {@link cascading.pipe.Checkpoint} Tap. 
584 * 585 * @param compressor of type Compressor, see {@link Compressors} 586 * @param hasHeader 587 * @param delimitedParser 588 */ 589 @ConstructorProperties({"compressor", "hasHeader", "delimitedParser"}) 590 public TextDelimited( Compressor compressor, boolean hasHeader, DelimitedParser delimitedParser ) 591 { 592 this( Fields.ALL, compressor, hasHeader, hasHeader, delimitedParser ); 593 } 594 595 /** 596 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 597 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 598 * <p> 599 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 600 * with a {@link cascading.pipe.Checkpoint} Tap. 601 * <p> 602 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 603 * 604 * @param compressor of type Compressor, see {@link Compressors} 605 * @param delimitedParser 606 */ 607 @ConstructorProperties({"compressor", "delimitedParser"}) 608 public TextDelimited( Compressor compressor, DelimitedParser delimitedParser ) 609 { 610 this( Fields.ALL, compressor, true, true, delimitedParser ); 611 } 612 613 /** 614 * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter. 615 * 616 * @param fields of type Fields 617 * @param compressor of type Compressor, see {@link Compressors} 618 */ 619 @ConstructorProperties({"fields", "compressor"}) 620 public TextDelimited( Fields fields, Compressor compressor ) 621 { 622 this( fields, compressor, "\t", null, null ); 623 } 624 625 /** 626 * Constructor TextDelimited creates a new TextDelimited instance. 
627 * 628 * @param fields of type Fields 629 * @param compressor of type Compressor, see {@link Compressors} 630 * @param delimiter of type String 631 */ 632 @ConstructorProperties({"fields", "compressor", "delimiter"}) 633 public TextDelimited( Fields fields, Compressor compressor, String delimiter ) 634 { 635 this( fields, compressor, delimiter, null, null ); 636 } 637 638 /** 639 * Constructor TextDelimited creates a new TextDelimited instance. 640 * 641 * @param fields of type Fields 642 * @param compressor of type Compressor, see {@link Compressors} 643 * @param hasHeader of type boolean 644 * @param delimiter of type String 645 */ 646 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter"}) 647 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter ) 648 { 649 this( fields, compressor, hasHeader, hasHeader, delimiter, null, null ); 650 } 651 652 /** 653 * Constructor TextDelimited creates a new TextDelimited instance. 654 * 655 * @param fields of type Fields 656 * @param compressor of type Compressor, see {@link Compressors} 657 * @param skipHeader of type boolean 658 * @param delimiter of type String 659 */ 660 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter"}) 661 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter ) 662 { 663 this( fields, compressor, skipHeader, writeHeader, delimiter, null, null ); 664 } 665 666 /** 667 * Constructor TextDelimited creates a new TextDelimited instance. 
668 * 669 * @param fields of type Fields 670 * @param compressor of type Compressor, see {@link Compressors} 671 * @param delimiter of type String 672 * @param types of type Class[] 673 */ 674 @ConstructorProperties({"fields", "compressor", "delimiter", "types"}) 675 public TextDelimited( Fields fields, Compressor compressor, String delimiter, Class[] types ) 676 { 677 this( fields, compressor, delimiter, null, types ); 678 } 679 680 /** 681 * Constructor TextDelimited creates a new TextDelimited instance. 682 * 683 * @param fields of type Fields 684 * @param compressor of type Compressor, see {@link Compressors} 685 * @param hasHeader of type boolean 686 * @param delimiter of type String 687 * @param types of type Class[] 688 */ 689 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "types"}) 690 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, Class[] types ) 691 { 692 this( fields, compressor, hasHeader, hasHeader, delimiter, null, types ); 693 } 694 695 /** 696 * Constructor TextDelimited creates a new TextDelimited instance. 697 * 698 * @param fields of type Fields 699 * @param compressor of type Compressor, see {@link Compressors} 700 * @param skipHeader of type boolean 701 * @param writeHeader of type boolean 702 * @param delimiter of type String 703 * @param types of type Class[] 704 */ 705 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "types"}) 706 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 707 { 708 this( fields, compressor, skipHeader, writeHeader, delimiter, null, types ); 709 } 710 711 /** 712 * Constructor TextDelimited creates a new TextDelimited instance. 
713 * 714 * @param fields of type Fields 715 * @param compressor of type Compressor, see {@link Compressors} 716 * @param delimiter of type String 717 * @param quote of type String 718 * @param types of type Class[] 719 */ 720 @ConstructorProperties({"fields", "compressor", "delimiter", "quote", "types"}) 721 public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote, Class[] types ) 722 { 723 this( fields, compressor, false, delimiter, quote, types ); 724 } 725 726 /** 727 * Constructor TextDelimited creates a new TextDelimited instance. 728 * 729 * @param fields of type Fields 730 * @param compressor of type Compressor, see {@link Compressors} 731 * @param hasHeader of type boolean 732 * @param delimiter of type String 733 * @param quote of type String 734 * @param types of type Class[] 735 */ 736 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types"}) 737 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types ) 738 { 739 this( fields, compressor, hasHeader, hasHeader, delimiter, quote, types, true ); 740 } 741 742 /** 743 * Constructor TextDelimited creates a new TextDelimited instance. 744 * 745 * @param fields of type Fields 746 * @param compressor of type Compressor, see {@link Compressors} 747 * @param skipHeader of type boolean 748 * @param writeHeader of type boolean 749 * @param delimiter of type String 750 * @param quote of type String 751 * @param types of type Class[] 752 */ 753 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 754 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 755 { 756 this( fields, compressor, skipHeader, writeHeader, delimiter, quote, types, true ); 757 } 758 759 /** 760 * Constructor TextDelimited creates a new TextDelimited instance. 
761 * 762 * @param fields of type Fields 763 * @param compressor of type Compressor, see {@link Compressors} 764 * @param delimiter of type String 765 * @param quote of type String 766 * @param types of type Class[] 767 * @param safe of type boolean 768 */ 769 @ConstructorProperties({"fields", "compressor", "delimiter", "quote", "types", "safe"}) 770 public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote, Class[] types, boolean safe ) 771 { 772 this( fields, compressor, false, delimiter, quote, types, safe ); 773 } 774 775 /** 776 * Constructor TextDelimited creates a new TextDelimited instance. 777 * 778 * @param fields of type Fields 779 * @param compressor of type Compressor, see {@link Compressors} 780 * @param hasHeader of type boolean 781 * @param delimiter of type String 782 * @param quote of type String 783 * @param types of type Class[] 784 * @param safe of type boolean 785 */ 786 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types", "safe"}) 787 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 788 { 789 this( fields, compressor, hasHeader, hasHeader, delimiter, true, quote, types, safe ); 790 } 791 792 /** 793 * Constructor TextDelimited creates a new TextDelimited instance. 
794 * 795 * @param fields of type Fields 796 * @param compressor of type Compressor, see {@link Compressors} 797 * @param hasHeader of type boolean 798 * @param delimiter of type String 799 * @param quote of type String 800 * @param types of type Class[] 801 * @param safe of type boolean 802 * @param charsetName of type String 803 */ 804 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"}) 805 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName ) 806 { 807 this( fields, compressor, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName ); 808 } 809 810 /** 811 * Constructor TextDelimited creates a new TextDelimited instance. 812 * 813 * @param fields of type Fields 814 * @param compressor of type Compressor, see {@link Compressors} 815 * @param skipHeader of type boolean 816 * @param writeHeader of type boolean 817 * @param delimiter of type String 818 * @param quote of type String 819 * @param types of type Class[] 820 * @param safe of type boolean 821 */ 822 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"}) 823 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 824 { 825 this( fields, compressor, skipHeader, writeHeader, delimiter, true, quote, types, safe ); 826 } 827 828 /** 829 * Constructor TextDelimited creates a new TextDelimited instance. 
830 * 831 * @param fields of type Fields 832 * @param compressor of type Compressor, see {@link Compressors} 833 * @param delimiter of type String 834 * @param quote of type String 835 */ 836 @ConstructorProperties({"fields", "compressor", "delimiter", "quote"}) 837 public TextDelimited( Fields fields, Compressor compressor, String delimiter, String quote ) 838 { 839 this( fields, compressor, false, delimiter, quote, null, true ); 840 } 841 842 /** 843 * Constructor TextDelimited creates a new TextDelimited instance. 844 * 845 * @param fields of type Fields 846 * @param compressor of type Compressor, see {@link Compressors} 847 * @param hasHeader of type boolean 848 * @param delimiter of type String 849 * @param quote of type String 850 */ 851 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote"}) 852 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote ) 853 { 854 this( fields, compressor, hasHeader, delimiter, quote, null, true ); 855 } 856 857 /** 858 * Constructor TextDelimited creates a new TextDelimited instance. 859 * 860 * @param fields of type Fields 861 * @param compressor of type Compressor, see {@link Compressors} 862 * @param hasHeader of type boolean 863 * @param delimiter of type String 864 * @param quote of type String 865 * @param charsetName of type String 866 */ 867 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimiter", "quote", "charsetName"}) 868 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, String delimiter, String quote, String charsetName ) 869 { 870 this( fields, compressor, hasHeader, delimiter, quote, null, true, charsetName ); 871 } 872 873 /** 874 * Constructor TextDelimited creates a new TextDelimited instance. 
875 * 876 * @param fields of type Fields 877 * @param compressor of type Compressor, see {@link Compressors} 878 * @param skipHeader of type boolean 879 * @param writeHeader of type boolean 880 * @param delimiter of type String 881 * @param strict of type boolean 882 * @param quote of type String 883 * @param types of type Class[] 884 * @param safe of type boolean 885 */ 886 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", 887 "safe"}) 888 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe ) 889 { 890 this( fields, compressor, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET ); 891 } 892 893 /** 894 * Constructor TextDelimited creates a new TextDelimited instance. 895 * 896 * @param fields of type Fields 897 * @param compressor of type Compressor, see {@link Compressors} 898 * @param skipHeader of type boolean 899 * @param writeHeader of type boolean 900 * @param delimiter of type String 901 * @param strict of type boolean 902 * @param quote of type String 903 * @param types of type Class[] 904 * @param safe of type boolean 905 * @param charsetName of type String 906 */ 907 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimiter", "strict", "quote", "types", 908 "safe", "charsetName"}) 909 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName ) 910 { 911 this( fields, compressor, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) ); 912 } 913 914 /** 915 * Constructor TextDelimited creates a new TextDelimited instance. 
916 * 917 * @param fields of type Fields 918 * @param compressor of type Compressor, see {@link Compressors} 919 * @param writeHeader of type boolean 920 * @param delimitedParser of type DelimitedParser 921 */ 922 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "delimitedParser"}) 923 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 924 { 925 this( fields, compressor, skipHeader, writeHeader, null, delimitedParser ); 926 } 927 928 /** 929 * Constructor TextDelimited creates a new TextDelimited instance. 930 * 931 * @param fields of type Fields 932 * @param compressor of type Compressor, see {@link Compressors} 933 * @param hasHeader of type boolean 934 * @param delimitedParser of type DelimitedParser 935 */ 936 @ConstructorProperties({"fields", "compressor", "hasHeader", "delimitedParser"}) 937 public TextDelimited( Fields fields, Compressor compressor, boolean hasHeader, DelimitedParser delimitedParser ) 938 { 939 this( fields, compressor, hasHeader, hasHeader, null, delimitedParser ); 940 } 941 942 /** 943 * Constructor TextDelimited creates a new TextDelimited instance. 
944 * 945 * @param fields of type Fields 946 * @param compressor of type Compressor, see {@link Compressors} 947 * @param compressor of type Compressor, see {@link Compressors} 948 * @param writeHeader of type boolean 949 * @param charsetName of type String 950 * @param delimitedParser of type DelimitedParser 951 */ 952 @ConstructorProperties({"fields", "compressor", "skipHeader", "writeHeader", "charsetName", "delimitedParser"}) 953 public TextDelimited( Fields fields, Compressor compressor, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser ) 954 { 955 super( fields, fields, compressor ); 956 957 this.delimitedParser = delimitedParser; 958 959 // normalizes ALL and UNKNOWN 960 // calls reset on delimitedParser 961 setSourceFields( fields ); 962 setSinkFields( fields ); 963 964 this.skipHeader = skipHeader; 965 this.writeHeader = writeHeader; 966 967 if( charsetName != null ) 968 this.charsetName = charsetName; 969 970 // throws an exception if not found 971 Charset.forName( this.charsetName ); 972 } 973 974 @Property(name = "charset", visibility = Visibility.PUBLIC) 975 @PropertyDescription("character set used.") 976 public String getCharsetName() 977 { 978 return charsetName; 979 } 980 981 /** 982 * Method getDelimiter returns the delimiter used to parse fields from the current line of text. 983 * 984 * @return a String 985 */ 986 @Property(name = "delimiter", visibility = Visibility.PUBLIC) 987 @PropertyDescription("The delimiter used to separate fields.") 988 public String getDelimiter() 989 { 990 return delimitedParser.getDelimiter(); 991 } 992 993 /** 994 * Method getQuote returns the quote string, if any, used to encapsulate each field in a line to delimited text. 
995 * 996 * @return a String 997 */ 998 @Property(name = "quote", visibility = Visibility.PUBLIC) 999 @PropertyDescription("The string used for quoting.") 1000 public String getQuote() 1001 { 1002 return delimitedParser.getQuote(); 1003 } 1004 1005 public LineNumberReader createInput( InputStream inputStream ) 1006 { 1007 try 1008 { 1009 return new LineNumberReader( new InputStreamReader( inputStream, charsetName ) ); 1010 } 1011 catch( UnsupportedEncodingException exception ) 1012 { 1013 throw new TapException( exception ); 1014 } 1015 } 1016 1017 public PrintWriter createOutput( OutputStream outputStream ) 1018 { 1019 try 1020 { 1021 return new PrintWriter( new OutputStreamWriter( outputStream, charsetName ) ); 1022 } 1023 catch( UnsupportedEncodingException exception ) 1024 { 1025 throw new TapException( exception ); 1026 } 1027 } 1028 1029 @Override 1030 public void setSinkFields( Fields sinkFields ) 1031 { 1032 super.setSourceFields( sinkFields ); 1033 super.setSinkFields( sinkFields ); 1034 1035 if( delimitedParser != null ) 1036 delimitedParser.reset( getSourceFields(), getSinkFields() ); 1037 } 1038 1039 @Override 1040 public void setSourceFields( Fields sourceFields ) 1041 { 1042 super.setSourceFields( sourceFields ); 1043 super.setSinkFields( sourceFields ); 1044 1045 if( delimitedParser != null ) 1046 delimitedParser.reset( getSourceFields(), getSinkFields() ); 1047 } 1048 1049 @Override 1050 public boolean isSymmetrical() 1051 { 1052 return super.isSymmetrical() && skipHeader == writeHeader; 1053 } 1054 1055 @Override 1056 public Fields retrieveSourceFields( FlowProcess<? 
extends Properties> process, Tap tap ) 1057 { 1058 if( !skipHeader || !getSourceFields().isUnknown() ) 1059 return getSourceFields(); 1060 1061 // no need to open them all 1062 if( tap instanceof CompositeTap ) 1063 tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next(); 1064 1065 tap = new FileTap( new TextLine( new Fields( "line" ), charsetName ), tap.getIdentifier() ); 1066 1067 setSourceFields( delimitedParser.parseFirstLine( process, tap ) ); 1068 1069 return getSourceFields(); 1070 } 1071 1072 @Override 1073 public void presentSourceFields( FlowProcess<? extends Properties> process, Tap tap, Fields fields ) 1074 { 1075 // do nothing 1076 } 1077 1078 @Override 1079 public void presentSinkFields( FlowProcess<? extends Properties> flowProcess, Tap tap, Fields fields ) 1080 { 1081 if( writeHeader ) 1082 presentSinkFieldsInternal( fields ); 1083 } 1084 1085 @Override 1086 public void sourceConfInit( FlowProcess<? extends Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf ) 1087 { 1088 } 1089 1090 @Override 1091 public void sourcePrepare( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 1092 { 1093 sourceCall.setContext( createInput( sourceCall.getInput() ) ); 1094 1095 sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() ); 1096 } 1097 1098 @Override 1099 public void sourceRePrepare( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 1100 { 1101 sourceCall.setContext( createInput( sourceCall.getInput() ) ); 1102 } 1103 1104 @Override 1105 public boolean source( FlowProcess<? 
extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 1106 { 1107 String line = sourceCall.getContext().readLine(); 1108 1109 if( line == null ) 1110 return false; 1111 1112 if( skipHeader && sourceCall.getContext().getLineNumber() == 1 ) // todo: optimize this away 1113 line = sourceCall.getContext().readLine(); 1114 1115 if( line == null ) 1116 return false; 1117 1118 Object[] split = delimitedParser.parseLine( line ); 1119 1120 // assumption it is better to re-use than to construct new 1121 Tuple tuple = sourceCall.getIncomingEntry().getTuple(); 1122 1123 TupleViews.reset( tuple, split ); 1124 1125 return true; 1126 } 1127 1128 @Override 1129 public void sourceCleanup( FlowProcess<? extends Properties> flowProcess, SourceCall<LineNumberReader, InputStream> sourceCall ) throws IOException 1130 { 1131 sourceCall.setContext( null ); 1132 } 1133 1134 @Override 1135 public void sinkConfInit( FlowProcess<? extends Properties> flowProcess, Tap<Properties, InputStream, OutputStream> tap, Properties conf ) 1136 { 1137 } 1138 1139 @Override 1140 public void sinkPrepare( FlowProcess<? 
extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) 1141 { 1142 OutputStream originalOutput = sinkCall.getOutput(); 1143 sinkCall.setContext( createOutput( originalOutput ) ); 1144 1145 if( writeHeader && !isAppendingFile( sinkCall, originalOutput ) ) 1146 { 1147 Fields fields = sinkCall.getOutgoingEntry().getFields(); 1148 delimitedParser.joinFirstLine( fields, sinkCall.getContext() ); 1149 1150 sinkCall.getContext().println(); 1151 } 1152 } 1153 1154 protected boolean isAppendingFile( SinkCall<PrintWriter, OutputStream> sinkCall, OutputStream originalOutput ) 1155 { 1156 try 1157 { 1158 return sinkCall.getTap().getSinkMode() == SinkMode.UPDATE && 1159 originalOutput instanceof FileOutputStream && 1160 ( (FileOutputStream) originalOutput ).getChannel().position() != 0; 1161 } 1162 catch( IOException exception ) 1163 { 1164 // the error will be thrown immediately downstream 1165 return false; 1166 } 1167 } 1168 1169 @Override 1170 public void sink( FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) throws IOException 1171 { 1172 TupleEntry tupleEntry = sinkCall.getOutgoingEntry(); 1173 1174 Iterable<String> strings = tupleEntry.asIterableOf( String.class ); 1175 1176 delimitedParser.joinLine( strings, sinkCall.getContext() ); 1177 1178 sinkCall.getContext().println(); 1179 } 1180 1181 @Override 1182 public void sinkCleanup( FlowProcess<? extends Properties> flowProcess, SinkCall<PrintWriter, OutputStream> sinkCall ) 1183 { 1184 sinkCall.getContext().flush(); 1185 sinkCall.setContext( null ); 1186 } 1187 1188 @Override 1189 public String getExtension() 1190 { 1191 switch( getDelimiter().trim() ) 1192 { 1193 case "\t": 1194 return "tsv"; 1195 1196 case ",": 1197 return "csv"; 1198 } 1199 1200 return "txt"; 1201 } 1202 }