001/* 002 * Copyright (c) 2016-2017 Chris K Wensel. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.tap; 023 024import java.io.Closeable; 025import java.io.IOException; 026import java.io.Serializable; 027import java.io.UncheckedIOException; 028import java.util.Set; 029import java.util.Spliterator; 030import java.util.Spliterators; 031import java.util.stream.Stream; 032import java.util.stream.StreamSupport; 033 034import cascading.flow.Flow; 035import cascading.flow.FlowElement; 036import cascading.flow.FlowException; 037import cascading.flow.FlowProcess; 038import cascading.flow.planner.Scope; 039import cascading.flow.planner.ScopedElement; 040import cascading.management.annotation.Property; 041import cascading.management.annotation.PropertyDescription; 042import cascading.management.annotation.PropertySanitizer; 043import cascading.management.annotation.Visibility; 044import cascading.pipe.Pipe; 045import cascading.property.ConfigDef; 046import cascading.scheme.Scheme; 047import cascading.tuple.Fields; 048import cascading.tuple.FieldsResolverException; 049import cascading.tuple.Tuple; 050import cascading.tuple.TupleEntry; 051import cascading.tuple.TupleEntryCollector; 052import cascading.tuple.TupleEntryIterator; 053import cascading.util.TraceUtil; 054import cascading.util.Traceable; 055import cascading.util.Util; 056 057/** 058 * A Tap represents the physical data source or sink in a connected {@link cascading.flow.Flow}. 059 * <p> 060 * That is, a source Tap is the head end of a connected {@link Pipe} and {@link Tuple} stream, and 061 * a sink Tap is the tail end. Kinds of Tap types are used to manage files from a local disk, 062 * distributed disk, remote storage like Amazon S3, or via FTP. It simply abstracts 063 * out the complexity of connecting to these types of data sources. 064 * <p> 065 * A Tap takes a {@link Scheme} instance, which is used to identify the type of resource (text file, binary file, etc). 066 * A Tap is responsible for how the resource is reached. 067 * <p> 068 * By default when planning a Flow, Tap equality is a function of the {@link #getIdentifier()} and {@link #getScheme()} 069 * values. That is, two Tap instances are the same Tap instance if they sink/source the same resource and sink/source 070 * the same fields. 071 * <p> 072 * Some more advanced taps, like a database tap, may need to extend equality to include any filtering, like the 073 * {@code where} clause in a SQL statement so two taps reading from the same SQL table aren't considered equal. 074 * <p> 075 * Taps are also used to determine dependencies between two or more {@link Flow} instances when used with a 076 * {@link cascading.cascade.Cascade}. In that case the {@link #getFullIdentifier(Object)} value is used and the Scheme 077 * is ignored. 078 */ 079public abstract class Tap<Config, Input, Output> implements ScopedElement, FlowElement, Serializable, Traceable 080 { 081 /** Field scheme */ 082 private Scheme<Config, Input, Output, ?, ?> scheme; 083 084 /** Field mode */ 085 SinkMode sinkMode = SinkMode.KEEP; 086 087 private ConfigDef configDef; 088 private ConfigDef nodeConfigDef; 089 private ConfigDef stepConfigDef; 090 091 /** Field id */ 092 private final String id = Util.createUniqueID(); // 3.0 planner relies on this being consistent 093 /** Field trace */ 094 private String trace = TraceUtil.captureDebugTrace( this ); // see TraceUtil.setTrace() to override 095 096 /** 097 * Convenience function to make an array of Tap instances. 098 * 099 * @param taps of type Tap 100 * @return Tap array 101 */ 102 public static Tap[] taps( Tap... taps ) 103 { 104 return taps; 105 } 106 107 /** 108 * Creates and returns a unique ID for the given Tap, this value is cached and may be used to uniquely identify 109 * the Tap instance in properties files etc. 110 * <p> 111 * This value is generally reproducible assuming the Tap identifier and the Scheme source and sink Fields remain consistent. 112 * 113 * @param tap of type Tap 114 * @return of type String 115 */ 116 public static synchronized String id( Tap tap ) 117 { 118 if( tap instanceof DecoratorTap ) 119 return id( ( (DecoratorTap) tap ).getOriginal() ); 120 121 return tap.id; 122 } 123 124 protected Tap() 125 { 126 } 127 128 protected Tap( Scheme<Config, Input, Output, ?, ?> scheme ) 129 { 130 this.setScheme( scheme ); 131 } 132 133 protected Tap( Scheme<Config, Input, Output, ?, ?> scheme, SinkMode sinkMode ) 134 { 135 this.setScheme( scheme ); 136 this.sinkMode = sinkMode; 137 } 138 139 protected void setScheme( Scheme<Config, Input, Output, ?, ?> scheme ) 140 { 141 this.scheme = scheme; 142 } 143 144 /** 145 * Method getScheme returns the scheme of this Tap object. 146 * 147 * @return the scheme (type Scheme) of this Tap object. 148 */ 149 public Scheme<Config, Input, Output, ?, ?> getScheme() 150 { 151 return scheme; 152 } 153 154 @Override 155 public String getTrace() 156 { 157 return trace; 158 } 159 160 /** 161 * Method flowInit allows this Tap instance to initialize itself in context of the given {@link cascading.flow.Flow} instance. 162 * This method is guaranteed to be called before the Flow is started and the 163 * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} event is fired. 164 * <p> 165 * This method will be called once per Flow, and before {@link #sourceConfInit(cascading.flow.FlowProcess, Object)} and 166 * {@link #sinkConfInit(cascading.flow.FlowProcess, Object)} methods. 167 * 168 * @param flow of type Flow 169 */ 170 public void flowConfInit( Flow<Config> flow ) 171 { 172 173 } 174 175 /** 176 * Method sourceConfInit initializes this instance as a source. 177 * <p> 178 * This method maybe called more than once if this Tap instance is used outside the scope of a {@link cascading.flow.Flow} 179 * instance or if it participates in multiple times in a given Flow or across different Flows in 180 * a {@link cascading.cascade.Cascade}. 181 * <p> 182 * In the context of a Flow, it will be called after 183 * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} 184 * <p> 185 * Note that no resources or services should be modified by this method. 186 * 187 * @param flowProcess of type FlowProcess 188 * @param conf of type Config 189 */ 190 public void sourceConfInit( FlowProcess<? extends Config> flowProcess, Config conf ) 191 { 192 getScheme().sourceConfInit( flowProcess, this, conf ); 193 } 194 195 /** 196 * Method sinkConfInit initializes this instance as a sink. 197 * <p> 198 * This method maybe called more than once if this Tap instance is used outside the scope of a {@link cascading.flow.Flow} 199 * instance or if it participates in multiple times in a given Flow or across different Flows in 200 * a {@link cascading.cascade.Cascade}. 201 * <p> 202 * Note this method will be called in context of this Tap being used as a traditional 'sink' and as a 'trap'. 203 * <p> 204 * In the context of a Flow, it will be called after 205 * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} 206 * <p> 207 * Note that no resources or services should be modified by this method. If this Tap instance returns true for 208 * {@link #isReplace()}, then {@link #deleteResource(Object)} will be called by the parent Flow. 209 * 210 * @param flowProcess of type FlowProcess 211 * @param conf of type Config 212 */ 213 public void sinkConfInit( FlowProcess<? extends Config> flowProcess, Config conf ) 214 { 215 getScheme().sinkConfInit( flowProcess, this, conf ); 216 } 217 218 /** 219 * Method getIdentifier returns a String representing the resource this Tap instance represents. 220 * <p> 221 * Often, if the tap accesses a filesystem, the identifier is nothing more than the path to the file or directory. 222 * In other cases it may be a an URL or URI representing a connection string or remote resource. 223 * <p> 224 * Any two Tap instances having the same value for the identifier are considered equal. 225 * 226 * @return String 227 */ 228 @Property(name = "identifier", visibility = Visibility.PUBLIC) 229 @PropertyDescription("The resource this instance represents") 230 @PropertySanitizer("cascading.management.annotation.URISanitizer") 231 public abstract String getIdentifier(); 232 233 /** 234 * Method getSourceFields returns the sourceFields of this Tap object. 235 * 236 * @return the sourceFields (type Fields) of this Tap object. 237 */ 238 public Fields getSourceFields() 239 { 240 return getScheme().getSourceFields(); 241 } 242 243 /** 244 * Method getSinkFields returns the sinkFields of this Tap object. 245 * 246 * @return the sinkFields (type Fields) of this Tap object. 247 */ 248 public Fields getSinkFields() 249 { 250 return getScheme().getSinkFields(); 251 } 252 253 /** 254 * Method openForRead opens the resource represented by this Tap instance for reading. 255 * <p> 256 * {@code input} value may be null, if so, sub-classes must inquire with the underlying {@link Scheme} 257 * via {@link Scheme#sourceConfInit(cascading.flow.FlowProcess, Tap, Object)} to get the proper 258 * input type and instantiate it before calling {@code super.openForRead()}. 259 * <p> 260 * Note the returned iterator will return the same instance of {@link cascading.tuple.TupleEntry} on every call, 261 * thus a copy must be made of either the TupleEntry or the underlying {@code Tuple} instance if they are to be 262 * stored in a Collection. 263 * 264 * @param flowProcess of type FlowProcess 265 * @param input of type Input 266 * @return TupleEntryIterator 267 * @throws java.io.IOException when the resource cannot be opened 268 */ 269 public abstract TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess, Input input ) throws IOException; 270 271 /** 272 * Method openForRead opens the resource represented by this Tap instance for reading. 273 * <p> 274 * Note the returned iterator will return the same instance of {@link cascading.tuple.TupleEntry} on every call, 275 * thus a copy must be made of either the TupleEntry or the underlying {@code Tuple} instance if they are to be 276 * stored in a Collection. 277 * 278 * @param flowProcess of type FlowProcess 279 * @return TupleEntryIterator 280 * @throws java.io.IOException when the resource cannot be opened 281 */ 282 public TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess ) throws IOException 283 { 284 return openForRead( flowProcess, null ); 285 } 286 287 /** 288 * Method openForWrite opens the resource represented by this Tap instance for writing. 289 * <p> 290 * This method is used internally and does not honor the {@link SinkMode} setting. If SinkMode is 291 * {@link SinkMode#REPLACE}, this call may fail. See {@link #openForWrite(cascading.flow.FlowProcess)}. 292 * <p> 293 * {@code output} value may be null, if so, sub-classes must inquire with the underlying {@link Scheme} 294 * via {@link Scheme#sinkConfInit(cascading.flow.FlowProcess, Tap, Object)} to get the proper 295 * output type and instantiate it before calling {@code super.openForWrite()}. 296 * 297 * @param flowProcess of type FlowProcess 298 * @param output of type Output 299 * @return TupleEntryCollector 300 * @throws java.io.IOException when the resource cannot be opened 301 */ 302 public abstract TupleEntryCollector openForWrite( FlowProcess<? extends Config> flowProcess, Output output ) throws IOException; 303 304 /** 305 * Method openForWrite opens the resource represented by this Tap instance for writing. 306 * <p> 307 * This method is for user application use and does honor the {@link SinkMode#REPLACE} settings. That is, if 308 * SinkMode is set to {@link SinkMode#REPLACE} the underlying resource will be deleted. 309 * <p> 310 * Note if {@link SinkMode#UPDATE} is set, the resource will not be deleted. 311 * 312 * @param flowProcess of type FlowProcess 313 * @return TupleEntryCollector 314 * @throws java.io.IOException when the resource cannot be opened 315 */ 316 public TupleEntryCollector openForWrite( FlowProcess<? extends Config> flowProcess ) throws IOException 317 { 318 if( isReplace() ) 319 deleteResource( flowProcess ); 320 321 return openForWrite( flowProcess, null ); 322 } 323 324 @Override 325 public Scope outgoingScopeFor( Set<Scope> incomingScopes ) 326 { 327 // as a source Tap, we emit the scheme defined Fields 328 // as a sink Tap, we declare we emit the incoming Fields 329 // as a temp Tap, this method never gets called, but we emit what we consume 330 int count = 0; 331 for( Scope incomingScope : incomingScopes ) 332 { 333 Fields incomingFields = incomingScope.getIncomingTapFields(); 334 335 if( incomingFields != null ) 336 { 337 try 338 { 339 incomingFields.select( getSinkFields() ); 340 } 341 catch( FieldsResolverException exception ) 342 { 343 throw new TapException( this, exception.getSourceFields(), exception.getSelectorFields(), exception ); 344 } 345 346 count++; 347 } 348 } 349 350 if( count > 1 ) 351 throw new FlowException( "Tap may not have more than one incoming Scope" ); 352 353 // this allows the incoming to be passed through to the outgoing 354 Fields incomingFields = incomingScopes.size() == 0 ? null : incomingScopes.iterator().next().getIncomingTapFields(); 355 356 if( incomingFields != null && 357 ( isSource() && getSourceFields().equals( Fields.UNKNOWN ) || 358 isSink() && getSinkFields().equals( Fields.ALL ) ) ) 359 return new Scope( incomingFields ); 360 361 if( count == 1 ) 362 return new Scope( getSinkFields() ); 363 364 return new Scope( getSourceFields() ); 365 } 366 367 /** 368 * A hook for allowing a Scheme to lazily retrieve its source fields. 369 * 370 * @param flowProcess of type FlowProcess 371 * @return the found Fields 372 */ 373 public Fields retrieveSourceFields( FlowProcess<? extends Config> flowProcess ) 374 { 375 return getScheme().retrieveSourceFields( flowProcess, this ); 376 } 377 378 public void presentSourceFields( FlowProcess<? extends Config> flowProcess, Fields fields ) 379 { 380 getScheme().presentSourceFields( flowProcess, this, fields ); 381 } 382 383 /** 384 * A hook for allowing a Scheme to lazily retrieve its sink fields. 385 * 386 * @param flowProcess of type FlowProcess 387 * @return the found Fields 388 */ 389 public Fields retrieveSinkFields( FlowProcess<? extends Config> flowProcess ) 390 { 391 return getScheme().retrieveSinkFields( flowProcess, this ); 392 } 393 394 public void presentSinkFields( FlowProcess<? extends Config> flowProcess, Fields fields ) 395 { 396 getScheme().presentSinkFields( flowProcess, this, fields ); 397 } 398 399 @Override 400 public Fields resolveIncomingOperationArgumentFields( Scope incomingScope ) 401 { 402 return incomingScope.getIncomingTapFields(); 403 } 404 405 @Override 406 public Fields resolveIncomingOperationPassThroughFields( Scope incomingScope ) 407 { 408 return incomingScope.getIncomingTapFields(); 409 } 410 411 /** 412 * Method getFullIdentifier returns a fully qualified resource identifier. 413 * 414 * @param flowProcess of type FlowProcess 415 * @return String 416 */ 417 public String getFullIdentifier( FlowProcess<? extends Config> flowProcess ) 418 { 419 return getFullIdentifier( flowProcess.getConfig() ); 420 } 421 422 /** 423 * Method getFullIdentifier returns a fully qualified resource identifier. 424 * 425 * @param conf of type Config 426 * @return String 427 */ 428 public String getFullIdentifier( Config conf ) 429 { 430 return getIdentifier(); 431 } 432 433 /** 434 * Method createResource creates the underlying resource. 435 * 436 * @param flowProcess of type FlowProcess 437 * @return boolean 438 * @throws IOException when there is an error making directories 439 */ 440 public boolean createResource( FlowProcess<? extends Config> flowProcess ) throws IOException 441 { 442 return createResource( flowProcess.getConfig() ); 443 } 444 445 /** 446 * Method createResource creates the underlying resource. 447 * 448 * @param conf of type Config 449 * @return boolean 450 * @throws IOException when there is an error making directories 451 */ 452 public abstract boolean createResource( Config conf ) throws IOException; 453 454 /** 455 * Method deleteResource deletes the resource represented by this instance. 456 * 457 * @param flowProcess of type FlowProcess 458 * @return boolean 459 * @throws IOException when the resource cannot be deleted 460 */ 461 public boolean deleteResource( FlowProcess<? extends Config> flowProcess ) throws IOException 462 { 463 return deleteResource( flowProcess.getConfig() ); 464 } 465 466 /** 467 * Method deleteResource deletes the resource represented by this instance. 468 * 469 * @param conf of type Config 470 * @return boolean 471 * @throws IOException when the resource cannot be deleted 472 */ 473 public abstract boolean deleteResource( Config conf ) throws IOException; 474 475 /** 476 * Method prepareResourceForRead allows the underlying resource to be notified when reading will begin. 477 * <p> 478 * This method will be called client side so that any remote or external resources can be initialized. 479 * <p> 480 * If this method returns {@code false}, an exception will be thrown halting the current Flow. 481 * <p> 482 * In most cases, resource initialization should happen in the {@link #openForRead(FlowProcess, Object)} method. 483 * <p> 484 * This allows for initialization of cluster side resources, like a JDBC driver used to read data from a database, 485 * that cannot be passed client to cluster. 486 * 487 * @param conf of type Config 488 * @return returns true if successful 489 * @throws IOException 490 */ 491 public boolean prepareResourceForRead( Config conf ) throws IOException 492 { 493 return true; 494 } 495 496 /** 497 * Method prepareResourceForWrite allows the underlying resource to be notified when writing will begin. 498 * <p> 499 * This method will be called once client side so that any remote or external resources can be initialized. 500 * <p> 501 * If this method returns {@code false}, an exception will be thrown halting the current Flow. 502 * <p> 503 * In most cases, resource initialization should happen in the {@link #openForWrite(FlowProcess, Object)} method. 504 * <p> 505 * This allows for initialization of cluster side resources, like a JDBC driver used to write data to a database, 506 * that cannot be passed client to cluster. 507 * <p> 508 * In the above JDBC example, overriding this method will allow for testing for the existence of and/or creating 509 * a remote table used by all individual cluster side tasks. 510 * 511 * @param conf of type Config 512 * @return returns true if successful 513 * @throws IOException 514 */ 515 public boolean prepareResourceForWrite( Config conf ) throws IOException 516 { 517 return true; 518 } 519 520 /** 521 * Method commitResource allows the underlying resource to be notified when all write processing is 522 * successful so that any additional cleanup or processing may be completed. 523 * <p> 524 * See {@link #rollbackResource(Object)} to handle cleanup in the face of failures. 525 * <p> 526 * This method is invoked once client side and not in the cluster, if any. 527 * <p> 528 * If other sink Tap instance in a given Flow fail on commitResource after called on this instance, 529 * rollbackResource will not be called. 530 * 531 * @param conf of type Config 532 * @return returns true if successful 533 * @throws IOException 534 */ 535 public boolean commitResource( Config conf ) throws IOException 536 { 537 return true; 538 } 539 540 /** 541 * Method rollbackResource allows the underlying resource to be notified when any write processing has failed or 542 * was stopped so that any cleanup may be started. 543 * <p> 544 * See {@link #commitResource(Object)} to handle cleanup when the write has successfully completed. 545 * <p> 546 * This method is invoked once client side and not in the cluster, if any. 547 * 548 * @param conf of type Config 549 * @return returns true if successful 550 * @throws IOException 551 */ 552 public boolean rollbackResource( Config conf ) throws IOException 553 { 554 return true; 555 } 556 557 /** 558 * Method resourceExists returns true if the path represented by this instance exists. 559 * 560 * @param flowProcess of type FlowProcess 561 * @return true if the underlying resource already exists 562 * @throws IOException when the status cannot be determined 563 */ 564 public boolean resourceExists( FlowProcess<? extends Config> flowProcess ) throws IOException 565 { 566 return resourceExists( flowProcess.getConfig() ); 567 } 568 569 /** 570 * Method resourceExists returns true if the path represented by this instance exists. 571 * 572 * @param conf of type Config 573 * @return true if the underlying resource already exists 574 * @throws IOException when the status cannot be determined 575 */ 576 public abstract boolean resourceExists( Config conf ) throws IOException; 577 578 /** 579 * Method getModifiedTime returns the date this resource was last modified. 580 * <p> 581 * If the resource does not exist, returns zero (0). 582 * <p> 583 * If the resource is continuous, returns {@link Long#MAX_VALUE}. 584 * 585 * @param flowProcess of type FlowProcess 586 * @return The date this resource was last modified. 587 * @throws IOException 588 */ 589 public long getModifiedTime( FlowProcess<? extends Config> flowProcess ) throws IOException 590 { 591 return getModifiedTime( flowProcess.getConfig() ); 592 } 593 594 /** 595 * Method getModifiedTime returns the date this resource was last modified. 596 * <p> 597 * If the resource does not exist, returns zero (0). 598 * <p> 599 * If the resource is continuous, returns {@link Long#MAX_VALUE}. 600 * 601 * @param conf of type Config 602 * @return The date this resource was last modified. 603 * @throws IOException 604 */ 605 public abstract long getModifiedTime( Config conf ) throws IOException; 606 607 /** 608 * Method getSinkMode returns the {@link SinkMode} }of this Tap object. 609 * 610 * @return the sinkMode (type SinkMode) of this Tap object. 611 */ 612 public SinkMode getSinkMode() 613 { 614 return sinkMode; 615 } 616 617 /** 618 * Method isKeep indicates whether the resource represented by this instance should be kept if it 619 * already exists when the Flow is started. 620 * 621 * @return boolean 622 */ 623 public boolean isKeep() 624 { 625 return sinkMode == SinkMode.KEEP; 626 } 627 628 /** 629 * Method isReplace indicates whether the resource represented by this instance should be deleted if it 630 * already exists when the Flow is started. 631 * 632 * @return boolean 633 */ 634 public boolean isReplace() 635 { 636 return sinkMode == SinkMode.REPLACE; 637 } 638 639 /** 640 * Method isUpdate indicates whether the resource represented by this instance should be updated if it already 641 * exists. Otherwise a new resource will be created, via {@link #createResource(Object)}, when the Flow is started. 642 * 643 * @return boolean 644 */ 645 public boolean isUpdate() 646 { 647 return sinkMode == SinkMode.UPDATE; 648 } 649 650 /** 651 * Method isSink returns true if this Tap instance can be used as a sink. 652 * 653 * @return boolean 654 */ 655 public boolean isSink() 656 { 657 return getScheme().isSink(); 658 } 659 660 /** 661 * Method isSource returns true if this Tap instance can be used as a source. 662 * 663 * @return boolean 664 */ 665 public boolean isSource() 666 { 667 return getScheme().isSource(); 668 } 669 670 /** 671 * Method isTemporary returns true if this Tap is temporary (used for intermediate results). 672 * 673 * @return the temporary (type boolean) of this Tap object. 674 */ 675 public boolean isTemporary() 676 { 677 return false; 678 } 679 680 /** 681 * Returns a {@link cascading.property.ConfigDef} instance that allows for local properties to be set and made available via 682 * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked. 683 * <p> 684 * Any properties set on the configDef will not show up in any {@link Flow} or {@link cascading.flow.FlowStep} process 685 * level configuration, but will override any of those values as seen by the current Tap instance method call where a 686 * FlowProcess is provided except for the {@link #sourceConfInit(cascading.flow.FlowProcess, Object)} and 687 * {@link #sinkConfInit(cascading.flow.FlowProcess, Object)} methods. 688 * <p> 689 * That is, the {@code *confInit} methods are called before any ConfigDef is applied, so any values placed into 690 * a ConfigDef instance will not be visible to them. 691 * 692 * @return an instance of ConfigDef 693 */ 694 public ConfigDef getConfigDef() 695 { 696 if( configDef == null ) 697 configDef = new ConfigDef(); 698 699 return configDef; 700 } 701 702 /** 703 * Returns {@code true} if there are properties in the configDef instance. 704 * 705 * @return true if there are configDef properties 706 */ 707 public boolean hasConfigDef() 708 { 709 return configDef != null && !configDef.isEmpty(); 710 } 711 712 /** 713 * Returns a {@link ConfigDef} instance that allows for process level properties to be set and made available via 714 * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked. 715 * <p> 716 * Any properties set on the nodeConfigDef will not show up in any Flow configuration, but will show up in 717 * the current process {@link cascading.flow.FlowNode} (in Apache Tez the Vertex configuration). Any value set in the 718 * nodeConfigDef will be overridden by the pipe local {@code #getConfigDef} instance. 719 * <p> 720 * Use this method to tweak properties in the process node this tap instance is planned into. 721 * 722 * @return an instance of ConfigDef 723 */ 724 @Override 725 public ConfigDef getNodeConfigDef() 726 { 727 if( nodeConfigDef == null ) 728 nodeConfigDef = new ConfigDef(); 729 730 return nodeConfigDef; 731 } 732 733 /** 734 * Returns {@code true} if there are properties in the nodeConfigDef instance. 735 * 736 * @return true if there are nodeConfigDef properties 737 */ 738 @Override 739 public boolean hasNodeConfigDef() 740 { 741 return nodeConfigDef != null && !nodeConfigDef.isEmpty(); 742 } 743 744 /** 745 * Returns a {@link ConfigDef} instance that allows for process level properties to be set and made available via 746 * a resulting {@link cascading.flow.FlowProcess} instance when the tap is invoked. 747 * <p> 748 * Any properties set on the stepConfigDef will not show up in any Flow configuration, but will show up in 749 * the current process {@link cascading.flow.FlowStep} (in Hadoop the MapReduce jobconf). Any value set in the 750 * stepConfigDef will be overridden by the tap local {@code #getConfigDef} instance. 751 * <p> 752 * Use this method to tweak properties in the process step this tap instance is planned into. 753 * <p> 754 * Note the {@code *confInit} methods are called before any ConfigDef is applied, so any values placed into 755 * a ConfigDef instance will not be visible to them. 756 * 757 * @return an instance of ConfigDef 758 */ 759 @Override 760 public ConfigDef getStepConfigDef() 761 { 762 if( stepConfigDef == null ) 763 stepConfigDef = new ConfigDef(); 764 765 return stepConfigDef; 766 } 767 768 /** 769 * Returns {@code true} if there are properties in the stepConfigDef instance. 770 * 771 * @return true if there are stepConfigDef properties 772 */ 773 @Override 774 public boolean hasStepConfigDef() 775 { 776 return stepConfigDef != null && !stepConfigDef.isEmpty(); 777 } 778 779 public Spliterator<TupleEntry> spliterator( FlowProcess<? extends Config> flowProcess ) 780 { 781 return splititerator( openForReadUnchecked( flowProcess ) ); 782 } 783 784 protected TupleEntryIterator openForReadUnchecked( FlowProcess<? extends Config> flowProcess ) 785 { 786 try 787 { 788 return openForRead( flowProcess ); 789 } 790 catch( IOException exception ) 791 { 792 throw new UncheckedIOException( exception ); 793 } 794 } 795 796 protected Spliterator<TupleEntry> splititerator( TupleEntryIterator iterator ) 797 { 798 return Spliterators.spliteratorUnknownSize( iterator, 0 ); 799 } 800 801 /** 802 * Method entryStream returns a {@link Stream} of {@link TupleEntry} instances from the given 803 * Tap instance. 804 * <p> 805 * Also see {@link cascading.tuple.TupleEntryStream#entryStream(Tap, FlowProcess)}. 806 * <p> 807 * Note, the returned Stream instance must be closed in order to clean up underlying resources. This 808 * is simply accomplished with a try-with-resources statement. 809 * 810 * @param flowProcess represents the current platform configuration 811 * @return a Stream of TupleEntry instances 812 */ 813 public Stream<TupleEntry> entryStream( FlowProcess<? extends Config> flowProcess ) 814 { 815 TupleEntryIterator iterator = openForReadUnchecked( flowProcess ); 816 Spliterator<TupleEntry> spliterator = splititerator( iterator ); 817 818 try 819 { 820 return StreamSupport 821 .stream( spliterator, false ) 822 .onClose( asUncheckedRunnable( iterator ) ); 823 } 824 catch( Error | RuntimeException error ) 825 { 826 try 827 { 828 iterator.close(); 829 } 830 catch( IOException exception ) 831 { 832 try 833 { 834 error.addSuppressed( exception ); 835 } 836 catch( Throwable ignore ){} 837 } 838 839 throw error; 840 } 841 } 842 843 /** 844 * Method entryStreamCopy returns a {@link Stream} of {@link TupleEntry} instances from the given 845 * Tap instance. 846 * <p> 847 * This method returns an TupleEntry instance suitable for caching. 848 * <p> 849 * Also see {@link cascading.tuple.TupleEntryStream#entryStreamCopy(Tap, FlowProcess)}. 850 * <p> 851 * Note, the returned Stream instance must be closed in order to clean up underlying resources. This 852 * is simply accomplished with a try-with-resources statement. 853 * 854 * @param flowProcess represents the current platform configuration 855 * @return a Stream of TupleEntry instances 856 */ 857 public Stream<TupleEntry> entryStreamCopy( FlowProcess<? extends Config> flowProcess ) 858 { 859 return entryStream( flowProcess ).map( TupleEntry::new ); 860 } 861 862 /** 863 * Method entryStream returns a {@link Stream} of {@link TupleEntry} instances from the given 864 * Tap instance. 865 * <p> 866 * Also see {@link cascading.tuple.TupleEntryStream#entryStream(Tap, FlowProcess, Fields)}. 867 * <p> 868 * Note, the returned Stream instance must be closed in order to clean up underlying resources. This 869 * is simply accomplished with a try-with-resources statement. 870 * 871 * @param flowProcess represents the current platform configuration 872 * @param selector the fields to select from the underlying TupleEntry 873 * @return a Stream of TupleEntry instances 874 */ 875 public Stream<TupleEntry> entryStream( FlowProcess<? extends Config> flowProcess, Fields selector ) 876 { 877 return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectEntry( selector ) ); 878 } 879 880 /** 881 * Method entryStreamCopy returns a {@link Stream} of {@link TupleEntry} instances from the given 882 * Tap instance. 883 * <p> 884 * Also see {@link cascading.tuple.TupleEntryStream#entryStreamCopy(Tap, FlowProcess)}. 885 * <p> 886 * Note, the returned Stream instance must be closed in order to clean up underlying resources. This 887 * is simply accomplished with a try-with-resources statement. 888 * 889 * @param flowProcess represents the current platform configuration 890 * @param selector the fields to select from the underlying TupleEntry 891 * @return a Stream of TupleEntry instances 892 */ 893 public Stream<TupleEntry> entryStreamCopy( FlowProcess<? extends Config> flowProcess, Fields selector ) 894 { 895 return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectEntryCopy( selector ) ); 896 } 897 898 /** 899 * Method tupleStream returns a {@link Stream} of {@link Tuple} instances from the given 900 * Tap instance. 901 * <p> 902 * Also see {@link cascading.tuple.TupleStream#tupleStream(Tap, FlowProcess)}. 903 * 904 * @param flowProcess represents the current platform configuration 905 * @return a Stream of Tuple instances 906 */ 907 public Stream<Tuple> tupleStream( FlowProcess<? extends Config> flowProcess ) 908 { 909 return entryStream( flowProcess ).map( TupleEntry::getTuple ); 910 } 911 912 /** 913 * Method tupleStreamCopy returns a {@link Stream} of {@link Tuple} instances from the given 914 * Tap instance. 915 * <p> 916 * This method returns an Tuple instance suitable for caching. 917 * <p> 918 * Also see {@link cascading.tuple.TupleStream#tupleStreamCopy(Tap, FlowProcess)}. 919 * 920 * @param flowProcess represents the current platform configuration 921 * @return a Stream of Tuple instances 922 */ 923 public Stream<Tuple> tupleStreamCopy( FlowProcess<? extends Config> flowProcess ) 924 { 925 return entryStream( flowProcess ).map( TupleEntry::getTupleCopy ); 926 } 927 928 /** 929 * Method tupleStream returns a {@link Stream} of {@link Tuple} instances from the given 930 * Tap instance. 931 * <p> 932 * Also see {@link cascading.tuple.TupleStream#tupleStream(Tap, FlowProcess, Fields)}. 933 * 934 * @param flowProcess represents the current platform configuration 935 * @param selector the fields to select from the underlying Tuple 936 * @return a Stream of TupleE instances 937 */ 938 public Stream<Tuple> tupleStream( FlowProcess<? extends Config> flowProcess, Fields selector ) 939 { 940 return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectTuple( selector ) ); 941 } 942 943 /** 944 * Method tupleStreamCopy returns a {@link Stream} of {@link Tuple} instances from the given 945 * Tap instance. 946 * <p> 947 * This method returns an Tuple instance suitable for caching. 948 * <p> 949 * Also see {@link cascading.tuple.TupleStream#tupleStreamCopy(Tap, FlowProcess)}. 950 * 951 * @param flowProcess represents the current platform configuration 952 * @param selector the fields to select from the underlying Tuple 953 * @return a Stream of TupleE instances 954 */ 955 public Stream<Tuple> tupleStreamCopy( FlowProcess<? extends Config> flowProcess, Fields selector ) 956 { 957 return entryStream( flowProcess ).map( tupleEntry -> tupleEntry.selectTupleCopy( selector ) ); 958 } 959 960 private static Runnable asUncheckedRunnable( Closeable closeable ) 961 { 962 return () -> 963 { 964 try 965 { 966 closeable.close(); 967 } 968 catch( IOException exception ) 969 { 970 throw new UncheckedIOException( exception ); 971 } 972 }; 973 } 974 975 @Override 976 public boolean equals( Object object ) 977 { 978 if( this == object ) 979 return true; 980 if( object == null || getClass() != object.getClass() ) 981 return false; 982 983 Tap tap = (Tap) object; 984 985 if( getIdentifier() != null ? !getIdentifier().equals( tap.getIdentifier() ) : tap.getIdentifier() != null ) 986 return false; 987 988 if( getScheme() != null ? !getScheme().equals( tap.getScheme() ) : tap.getScheme() != null ) 989 return false; 990 991 return true; 992 } 993 994 @Override 995 public int hashCode() 996 { 997 int result = getIdentifier() != null ? getIdentifier().hashCode() : 0; 998 999 result = 31 * result + ( getScheme() != null ? getScheme().hashCode() : 0 ); 1000 1001 return result; 1002 } 1003 1004 @Override 1005 public String toString() 1006 { 1007 if( getIdentifier() != null ) 1008 return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl( getIdentifier() ) + "\"]"; // sanitize 1009 else 1010 return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]"; 1011 } 1012 }