04_Flink Streaming DataStream


The DataStream object (abbreviated ds below) is the core handle in DataStream-API stream programs. Its methods fall into a few groups:

//1: Ingesting sources

1: A ds is produced from the env (StreamExecutionEnvironment) plus a SourceFunction; this is how data enters the job. A minimal skeleton is sketched below.
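A minimal sketch, assuming the usual flink-streaming imports; the host and port passed to socketTextStream are placeholder values:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// socketTextStream wraps a built-in SourceFunction that reads text lines from a TCP socket
DataStream<String> lines = env.socketTextStream("localhost", 9999);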

//2: Processing data (a sketch follows this list)

2: filter, drops elements for which a predicate returns false

3: map, a 1-to-1 transformation

4: flatMap, a 1-to-N transformation, e.g. splitting a string into tokens

6: transform, the escape hatch for custom operators when the built-ins are not enough (map, flatMap and filter are themselves implemented on top of it)

5: project, field projection (Tuple streams only)
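Continuing the sketch above: flatMap fans each line out into (word, 1) pairs and filter drops empty tokens; map would look the same but with exactly one output per input.

DataStream<Tuple2<String, Integer>> words = lines
        .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) {
                // one input line -> any number of output elements
                for (String w : line.split("\\s+")) {
                    out.collect(new Tuple2<>(w, 1));
                }
            }
        })
        .filter(new FilterFunction<Tuple2<String, Integer>>() {
            @Override
            public boolean filter(Tuple2<String, Integer> t) {
                return !t.f0.isEmpty();   // keep only non-empty words
            }
        });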

//3: Joining streams (a sketch follows this list)

7: coGroup, groups the elements of two streams by key and window and hands both groups to one function

8: join, a windowed equi-join of two streams

9: union, merges two (or more) streams of the same type
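union simply merges streams of the same type; join needs keys and a window. A sketch, where moreWords stands in for a second DataStream<Tuple2<String, Integer>> (an assumed name); coGroup uses the identical where/equalTo/window shape, but its apply function receives the two complete groups instead of matched pairs.

DataStream<Tuple2<String, Integer>> merged = words.union(moreWords);

DataStream<Tuple2<String, Integer>> joined = words
        .join(moreWords)
        .where(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> t) { return t.f0; }   // key of the left side
        })
        .equalTo(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> t) { return t.f0; }   // key of the right side
        })
        .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
        .apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> join(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
                return new Tuple2<>(a.f0, a.f1 + b.f1);   // combine each matching pair
            }
        });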

//4: Partitioning. There are many variants and the differences are subtle; for most jobs keyBy is all you need, the rest cover special scenarios or performance tuning (a sketch follows this list).

10: keyBy, partitions the stream by the hash of the specified key, so all records with the same key meet in the same parallel instance

11: shuffle, distributes records uniformly at random

12: broadcast, sends every record to every parallel instance of the next operator

13: forward, sends records to the local subtask of the next operator

14: rebalance, distributes records round-robin, evenly over all instances of the next operator

15: rescale, round-robin over only a subset of downstream instances

16: global, sends all records to the first instance of the next operator

17: partitionCustom, partitions by a user-defined Partitioner
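A sketch of the common cases. Only keyBy changes the logical grouping of the data; the others merely decide how records travel to the next operator.

// all records with the same word end up in the same parallel instance
KeyedStream<Tuple2<String, Integer>, Tuple> byWord = words.keyBy(0);   // key = tuple field 0

DataStream<Tuple2<String, Integer>> evened = words.rebalance();   // round-robin over all instances
DataStream<Tuple2<String, Integer>> copied = words.broadcast();   // a copy to every instance

// partitionCustom delegates the routing decision to your own Partitioner
DataStream<Tuple2<String, Integer>> custom = words.partitionCustom(
        new Partitioner<String>() {
            @Override
            public int partition(String key, int numPartitions) {
                // mask off the sign bit so the result is always a valid partition index
                return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
            }
        },
        0);   // partition on tuple field 0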

//5: Window functions, in two flavours: tumbling and sliding windows (a sketch follows this list)

18: timeWindowAll, tumbling and sliding windows over time

19: countWindowAll, tumbling and sliding windows over element counts

20: windowAll, custom windows via a WindowAssigner; the time and count windows are built on top of this
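The one-argument forms are tumbling windows, the two-argument forms sliding ones. As the source below notes, the *All windows are non-parallel: every element passes through a single operator instance.

// tumbling: an independent sum for every 10-second slice of the stream
DataStream<Tuple2<String, Integer>> perTenSeconds =
        words.timeWindowAll(Time.seconds(10)).sum(1);

// sliding count window: the sum of the last 100 elements, re-evaluated every 10 elements
DataStream<Tuple2<String, Integer>> lastHundred =
        words.countWindowAll(100, 10).sum(1);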

//6: Event time

21: assignTimestampsAndWatermarks, extracts an event timestamp from each element and generates watermarks, either periodically (AssignerWithPeriodicWatermarks) or per element (AssignerWithPunctuatedWatermarks); a sketch follows
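A sketch of the periodic variant. MyEvent with its long timestamp field is a hypothetical event type and events an assumed DataStream<MyEvent>; the 5000 ms out-of-orderness bound is likewise only an illustration.

// event time must be switched on at the environment first
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

DataStream<MyEvent> withTimestamps = events.assignTimestampsAndWatermarks(
        new AssignerWithPeriodicWatermarks<MyEvent>() {
            private long maxSeenTs = Long.MIN_VALUE;

            @Override
            public long extractTimestamp(MyEvent e, long previousElementTimestamp) {
                maxSeenTs = Math.max(maxSeenTs, e.timestamp);
                return e.timestamp;   // becomes the element's event-time timestamp
            }

            @Override
            public Watermark getCurrentWatermark() {
                // promise: no element more than 5s older than the newest one will still arrive
                return new Watermark(maxSeenTs == Long.MIN_VALUE ? Long.MIN_VALUE : maxSeenTs - 5000);
            }
        });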


//7: Output

22: addSink, attaches a SinkFunction implementation, the node where processed data leaves the pipeline (usually written to external storage)

23: writeUsingOutputFormat, wraps addSink via an OutputFormatSinkFunction, so any OutputFormat implementation can be used to write the data out (a sketch follows)
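A sketch of the sink end; the SinkFunction body is a placeholder for real delivery logic such as a database write.

words.print();                                         // debug sink to stdout
words.writeAsCsv("/tmp/words", WriteMode.OVERWRITE);   // goes through writeUsingOutputFormat + CsvOutputFormat

words.addSink(new SinkFunction<Tuple2<String, Integer>>() {
    @Override
    public void invoke(Tuple2<String, Integer> value) throws Exception {
        System.err.println(value);   // stand-in for the real write
    }
});

env.execute("DataStream walkthrough");   // nothing runs until execute() is called

The complete list of public DataStream method signatures, and the source of the class itself, follow for reference.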


org.apache.flink.streaming.api.datastream.DataStream.union(DataStream<T>...)
org.apache.flink.streaming.api.datastream.DataStream.split(OutputSelector<T>)
org.apache.flink.streaming.api.datastream.DataStream.connect(DataStream<R>)
org.apache.flink.streaming.api.datastream.DataStream.keyBy(KeySelector<T, K>)
org.apache.flink.streaming.api.datastream.DataStream.keyBy(int...)
org.apache.flink.streaming.api.datastream.DataStream.keyBy(String...)
org.apache.flink.streaming.api.datastream.DataStream.keyBy(Keys<T>)
org.apache.flink.streaming.api.datastream.DataStream.partitionCustom(Partitioner<K>, int)
org.apache.flink.streaming.api.datastream.DataStream.partitionCustom(Partitioner<K>, String)
org.apache.flink.streaming.api.datastream.DataStream.partitionCustom(Partitioner<K>, KeySelector<T, K>)
org.apache.flink.streaming.api.datastream.DataStream.partitionCustom(Partitioner<K>, Keys<T>)
org.apache.flink.streaming.api.datastream.DataStream.broadcast()
org.apache.flink.streaming.api.datastream.DataStream.shuffle()
org.apache.flink.streaming.api.datastream.DataStream.forward()
org.apache.flink.streaming.api.datastream.DataStream.rebalance()
org.apache.flink.streaming.api.datastream.DataStream.rescale()
org.apache.flink.streaming.api.datastream.DataStream.global()
org.apache.flink.streaming.api.datastream.DataStream.map(MapFunction<T, R>)
org.apache.flink.streaming.api.datastream.DataStream.flatMap(FlatMapFunction<T, R>)
org.apache.flink.streaming.api.datastream.DataStream.filter(FilterFunction<T>)
org.apache.flink.streaming.api.datastream.DataStream.project(int...)
org.apache.flink.streaming.api.datastream.DataStream.coGroup(DataStream<T2>)
org.apache.flink.streaming.api.datastream.DataStream.join(DataStream<T2>)
org.apache.flink.streaming.api.datastream.DataStream.timeWindowAll(Time)
org.apache.flink.streaming.api.datastream.DataStream.timeWindowAll(Time, Time)
org.apache.flink.streaming.api.datastream.DataStream.countWindowAll(long)
org.apache.flink.streaming.api.datastream.DataStream.countWindowAll(long, long)
org.apache.flink.streaming.api.datastream.DataStream.windowAll(WindowAssigner<? super T, W>)
org.apache.flink.streaming.api.datastream.DataStream.assignTimestamps(TimestampExtractor<T>)
org.apache.flink.streaming.api.datastream.DataStream.assignTimestampsAndWatermarks(AssignerWithPeriodicWatermarks<T>)
org.apache.flink.streaming.api.datastream.DataStream.assignTimestampsAndWatermarks(AssignerWithPunctuatedWatermarks<T>)
org.apache.flink.streaming.api.datastream.DataStream.writeUsingOutputFormat(OutputFormat<T>)
org.apache.flink.streaming.api.datastream.DataStream.transform(String, TypeInformation<R>, OneInputStreamOperator<T, R>)
org.apache.flink.streaming.api.datastream.DataStream.addSink(SinkFunction<T>)


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.api.datastream;

import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
// ... (remaining imports elided; Javadoc condensed to one-line summaries)

/**
 * A DataStream represents a stream of elements of the same type. A DataStream
 * can be transformed into another DataStream by applying a transformation, for
 * example {@link DataStream#map} or {@link DataStream#filter}.
 *
 * @param <T> The type of the elements in this stream.
 */
@Public
public class DataStream<T> {

    protected final StreamExecutionEnvironment environment;
    protected final StreamTransformation<T> transformation;

    /** Creates a new DataStream in the given execution environment. */
    public DataStream(StreamExecutionEnvironment environment, StreamTransformation<T> transformation) {
        this.environment = Preconditions.checkNotNull(environment, "Execution Environment must not be null.");
        this.transformation = Preconditions.checkNotNull(transformation, "Stream Transformation must not be null.");
    }

    /** Returns the ID of this DataStream in the current StreamExecutionEnvironment. */
    @Internal
    public int getId() {
        return transformation.getId();
    }

    /** Gets the parallelism for this operator. */
    public int getParallelism() {
        return transformation.getParallelism();
    }

    /** Gets the type of the stream. */
    public TypeInformation<T> getType() {
        return transformation.getOutputType();
    }

    /** Invokes the ClosureCleaner on the given function if closure cleaning is enabled in the ExecutionConfig. */
    protected <F> F clean(F f) {
        return getExecutionEnvironment().clean(f);
    }

    /** Returns the StreamExecutionEnvironment that was used to create this DataStream. */
    public StreamExecutionEnvironment getExecutionEnvironment() {
        return environment;
    }

    public ExecutionConfig getExecutionConfig() {
        return environment.getConfig();
    }

    /** Creates a new DataStream by merging DataStream outputs of the same type with each other. */
    @SafeVarargs
    public final DataStream<T> union(DataStream<T>... streams) {
        List<StreamTransformation<T>> unionedTransforms = new ArrayList<>();
        unionedTransforms.add(this.transformation);

        for (DataStream<T> newStream : streams) {
            if (!getType().equals(newStream.getType())) {
                throw new IllegalArgumentException("Cannot union streams of different types: "
                        + getType() + " and " + newStream.getType());
            }
            unionedTransforms.add(newStream.getTransformation());
        }
        return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms));
    }

    /** Directs tuples to specific named outputs using an OutputSelector; creates a SplitStream. */
    public SplitStream<T> split(OutputSelector<T> outputSelector) {
        return new SplitStream<>(this, clean(outputSelector));
    }

    /** Connects this stream with a (possibly differently typed) stream, for use with CoFunctions. */
    public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) {
        return new ConnectedStreams<>(environment, this, dataStream);
    }

    /** Creates a new KeyedStream that uses the provided key selector for partitioning its operator state. */
    public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
        return new KeyedStream<>(this, clean(key));
    }

    /** Partitions the operator state of a DataStream by the given key positions. */
    public KeyedStream<T, Tuple> keyBy(int... fields) {
        if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) {
            return keyBy(KeySelectorUtil.getSelectorForArray(fields, getType()));
        } else {
            return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
        }
    }

    /** Partitions by field expressions; a dot drills into objects, as in "field1.getInnerField2()". */
    public KeyedStream<T, Tuple> keyBy(String... fields) {
        return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
    }

    private KeyedStream<T, Tuple> keyBy(Keys<T> keys) {
        return new KeyedStream<>(this, clean(KeySelectorUtil.getSelectorForKeys(keys,
                getType(), getExecutionConfig())));
    }

    /** Partitions a tuple DataStream on the given key position using a custom partitioner (single-field keys only). */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, int field) {
        Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new int[]{field}, getType());
        return partitionCustom(partitioner, outExpressionKeys);
    }

    /** Partitions a POJO DataStream on the given field expression using a custom partitioner (single-field keys only). */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, String field) {
        Keys.ExpressionKeys<T> outExpressionKeys = new Keys.ExpressionKeys<>(new String[]{field}, getType());
        return partitionCustom(partitioner, outExpressionKeys);
    }

    /** Partitions on the key returned by the selector using a custom partitioner (the selector cannot return tuples). */
    public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, KeySelector<T, K> keySelector) {
        return setConnectionType(new CustomPartitionerWrapper<>(clean(partitioner), clean(keySelector)));
    }

    // private helper method for custom partitioning
    private <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, Keys<T> keys) {
        KeySelector<T, K> keySelector = KeySelectorUtil.getSelectorForOneKey(keys, partitioner, getType(), getExecutionConfig());
        return setConnectionType(new CustomPartitionerWrapper<>(clean(partitioner), clean(keySelector)));
    }

    /** Output elements are broadcast to every parallel instance of the next operation. */
    public DataStream<T> broadcast() {
        return setConnectionType(new BroadcastPartitioner<T>());
    }

    /** Output elements are shuffled uniformly randomly to the next operation. */
    @PublicEvolving
    public DataStream<T> shuffle() {
        return setConnectionType(new ShufflePartitioner<T>());
    }

    /** Output elements are forwarded to the local subtask of the next operation. */
    public DataStream<T> forward() {
        return setConnectionType(new ForwardPartitioner<T>());
    }

    /** Output elements are distributed evenly to the next operation in a round-robin fashion. */
    public DataStream<T> rebalance() {
        return setConnectionType(new RebalancePartitioner<T>());
    }

    /** Output elements go round-robin to a subset of downstream instances, determined by the up/downstream parallelisms. */
    @PublicEvolving
    public DataStream<T> rescale() {
        return setConnectionType(new RescalePartitioner<T>());
    }

    /** All output values go to the first instance of the next operator; use with care, it is a potential bottleneck. */
    @PublicEvolving
    public DataStream<T> global() {
        return setConnectionType(new GlobalPartitioner<T>());
    }

    /** Initiates an iterative part of the program; close it with IterativeStream#closeWith(DataStream). */
    @PublicEvolving
    public IterativeStream<T> iterate() {
        return new IterativeStream<>(this, 0);
    }

    /** Same as iterate(), with a max waiting time (ms) after which an idle iteration head terminates. */
    @PublicEvolving
    public IterativeStream<T> iterate(long maxWaitTimeMillis) {
        return new IterativeStream<>(this, maxWaitTimeMillis);
    }

    /** Applies a Map transformation; each MapFunction call returns exactly one element. */
    public <R> SingleOutputStreamOperator<R> map(MapFunction<T, R> mapper) {
        TypeInformation<R> outType = TypeExtractor.getMapReturnTypes(clean(mapper), getType(),
                Utils.getCallLocationName(), true);
        return transform("Map", outType, new StreamMap<>(clean(mapper)));
    }

    /** Applies a FlatMap transformation; each FlatMapFunction call can return any number of elements, including none. */
    public <R> SingleOutputStreamOperator<R> flatMap(FlatMapFunction<T, R> flatMapper) {
        TypeInformation<R> outType = TypeExtractor.getFlatMapReturnTypes(clean(flatMapper),
                getType(), Utils.getCallLocationName(), true);
        return transform("Flat Map", outType, new StreamFlatMap<>(clean(flatMapper)));
    }

    /** Applies a Filter transformation; retains only the elements for which the function returns true. */
    public SingleOutputStreamOperator<T> filter(FilterFunction<T> filter) {
        return transform("Filter", getType(), new StreamFilter<>(clean(filter)));
    }

    /** Projects each Tuple onto a (sub)set of fields. Only Tuple DataStreams can be projected. */
    @PublicEvolving
    public <R extends Tuple> SingleOutputStreamOperator<R> project(int... fieldIndexes) {
        return new StreamProjection<>(this, fieldIndexes).projectTupleX();
    }

    /** Creates a co-group operation; see CoGroupedStreams for how keys and window are specified. */
    public <T2> CoGroupedStreams<T, T2> coGroup(DataStream<T2> otherStream) {
        return new CoGroupedStreams<>(this, otherStream);
    }

    /** Creates a join operation; see JoinedStreams for how keys and window are specified. */
    public <T2> JoinedStreams<T, T2> join(DataStream<T2> otherStream) {
        return new JoinedStreams<>(this, otherStream);
    }

    /** Tumbling time windows over the whole stream; event or processing time depends on the time characteristic. */
    public AllWindowedStream<T, TimeWindow> timeWindowAll(Time size) {
        if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) {
            return windowAll(TumblingProcessingTimeWindows.of(size));
        } else {
            return windowAll(TumblingEventTimeWindows.of(size));
        }
    }

    /** Sliding time windows over the whole stream. */
    public AllWindowedStream<T, TimeWindow> timeWindowAll(Time size, Time slide) {
        if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) {
            return windowAll(SlidingProcessingTimeWindows.of(size, slide));
        } else {
            return windowAll(SlidingEventTimeWindows.of(size, slide));
        }
    }

    /** Tumbling count windows over the whole stream. */
    public AllWindowedStream<T, GlobalWindow> countWindowAll(long size) {
        return windowAll(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size)));
    }

    /** Sliding count windows over the whole stream. */
    public AllWindowedStream<T, GlobalWindow> countWindowAll(long size, long slide) {
        return windowAll(GlobalWindows.create())
                .evictor(CountEvictor.of(size))
                .trigger(CountTrigger.of(slide));
    }

    /** Windows the stream by a custom WindowAssigner. Inherently non-parallel: all elements pass through one operator instance. */
    @PublicEvolving
    public <W extends Window> AllWindowedStream<T, W> windowAll(WindowAssigner<? super T, W> assigner) {
        return new AllWindowedStream<>(this, assigner);
    }

    // ------------------------------------------------------------------------
    //  Timestamps and watermarks
    // ------------------------------------------------------------------------

    /** @deprecated Use assignTimestampsAndWatermarks(AssignerWithPeriodicWatermarks) or the punctuated variant instead. */
    @Deprecated
    public SingleOutputStreamOperator<T> assignTimestamps(TimestampExtractor<T> extractor) {
        // match parallelism to input, otherwise dop=1 sources could lead to some strange
        // behaviour: the watermark will creep along very slowly because the elements
        // from the source go to each extraction operator round robin.
        int inputParallelism = getTransformation().getParallelism();
        ExtractTimestampsOperator<T> operator = new ExtractTimestampsOperator<>(clean(extractor));
        return transform("ExtractTimestamps", getTransformation().getOutputType(), operator)
                .setParallelism(inputParallelism);
    }

    /** Assigns timestamps and periodically creates watermarks (interval set via ExecutionConfig#setAutoWatermarkInterval). */
    public SingleOutputStreamOperator<T> assignTimestampsAndWatermarks(
            AssignerWithPeriodicWatermarks<T> timestampAndWatermarkAssigner) {
        final int inputParallelism = getTransformation().getParallelism();
        final AssignerWithPeriodicWatermarks<T> cleanedAssigner = clean(timestampAndWatermarkAssigner);
        TimestampsAndPeriodicWatermarksOperator<T> operator =
                new TimestampsAndPeriodicWatermarksOperator<>(cleanedAssigner);
        return transform("Timestamps/Watermarks", getTransformation().getOutputType(), operator)
                .setParallelism(inputParallelism);
    }

    /** Assigns timestamps and creates watermarks punctuated by the stream elements themselves. */
    public SingleOutputStreamOperator<T> assignTimestampsAndWatermarks(
            AssignerWithPunctuatedWatermarks<T> timestampAndWatermarkAssigner) {
        final int inputParallelism = getTransformation().getParallelism();
        final AssignerWithPunctuatedWatermarks<T> cleanedAssigner = clean(timestampAndWatermarkAssigner);
        TimestampsAndPunctuatedWatermarksOperator<T> operator =
                new TimestampsAndPunctuatedWatermarksOperator<>(cleanedAssigner);
        return transform("Timestamps/Watermarks", getTransformation().getOutputType(), operator)
                .setParallelism(inputParallelism);
    }

    // ------------------------------------------------------------------------
    //  Data sinks
    // ------------------------------------------------------------------------

    /** Writes toString() of each element to stdout. */
    @PublicEvolving
    public DataStreamSink<T> print() {
        PrintSinkFunction<T> printFunction = new PrintSinkFunction<>();
        return addSink(printFunction);
    }

    /** Writes toString() of each element to stderr. */
    @PublicEvolving
    public DataStreamSink<T> printToErr() {
        PrintSinkFunction<T> printFunction = new PrintSinkFunction<>(true);
        return addSink(printFunction);
    }

    /** Writes the stream as a text file to the given path. */
    @PublicEvolving
    public DataStreamSink<T> writeAsText(String path) {
        return writeUsingOutputFormat(new TextOutputFormat<T>(new Path(path)));
    }

    /** Writes the stream as a text file; writeMode controls behaviour for existing files (NO_OVERWRITE / OVERWRITE). */
    @PublicEvolving
    public DataStreamSink<T> writeAsText(String path, WriteMode writeMode) {
        TextOutputFormat<T> tof = new TextOutputFormat<>(new Path(path));
        tof.setWriteMode(writeMode);
        return writeUsingOutputFormat(tof);
    }

    /** Writes a tuple stream as CSV to the given path. */
    @PublicEvolving
    public DataStreamSink<T> writeAsCsv(String path) {
        return writeAsCsv(path, null, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
    }

    /** Writes a tuple stream as CSV with the given write mode. */
    @PublicEvolving
    public DataStreamSink<T> writeAsCsv(String path, WriteMode writeMode) {
        return writeAsCsv(path, writeMode, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
    }

    /** Writes a tuple stream as CSV with the given write mode and row/field delimiters. */
    @SuppressWarnings("unchecked")
    @PublicEvolving
    public <X extends Tuple> DataStreamSink<T> writeAsCsv(
            String path, WriteMode writeMode, String rowDelimiter, String fieldDelimiter) {
        Preconditions.checkArgument(getType().isTupleType(),
                "The writeAsCsv() method can only be used on data streams of tuples.");
        CsvOutputFormat<X> of = new CsvOutputFormat<>(new Path(path), rowDelimiter, fieldDelimiter);
        if (writeMode != null) {
            of.setWriteMode(writeMode);
        }
        return writeUsingOutputFormat((OutputFormat<T>) of);
    }

    /** Writes the DataStream to a socket as a byte array, serialized by the given SerializationSchema. */
    @PublicEvolving
    public DataStreamSink<T> writeToSocket(String hostName, int port, SerializationSchema<T> schema) {
        DataStreamSink<T> returnStream = addSink(new SocketClientSink<>(hostName, port, schema, 0));
        returnStream.setParallelism(1); // It would not work if multiple instances would connect to the same port
        return returnStream;
    }

    /** Writes the stream using the given OutputFormat. The output does not participate in Flink's checkpointing. */
    @PublicEvolving
    public DataStreamSink<T> writeUsingOutputFormat(OutputFormat<T> format) {
        return addSink(new OutputFormatSinkFunction<>(format));
    }

    /** Passes a user-defined operator along with the type information that will transform the DataStream. */
    @PublicEvolving
    public <R> SingleOutputStreamOperator<R> transform(
            String operatorName, TypeInformation<R> outTypeInfo, OneInputStreamOperator<T, R> operator) {
        // read the output type of the input Transform to coax out errors about MissingTypeInfo
        transformation.getOutputType();

        OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
                this.transformation, operatorName, operator, outTypeInfo, environment.getParallelism());

        @SuppressWarnings({"unchecked", "rawtypes"})
        SingleOutputStreamOperator<R> returnStream = new SingleOutputStreamOperator(environment, resultTransform);

        getExecutionEnvironment().addOperator(resultTransform);
        return returnStream;
    }

    /** Internal function for setting the partitioner for the DataStream. */
    protected DataStream<T> setConnectionType(StreamPartitioner<T> partitioner) {
        return new DataStream<>(this.getExecutionEnvironment(),
                new PartitionTransformation<>(this.getTransformation(), partitioner));
    }

    /** Adds the given sink; only streams with sinks added are executed when StreamExecutionEnvironment#execute() is called. */
    public DataStreamSink<T> addSink(SinkFunction<T> sinkFunction) {
        // read the output type of the input Transform to coax out errors about MissingTypeInfo
        transformation.getOutputType();

        // configure the type if needed
        if (sinkFunction instanceof InputTypeConfigurable) {
            ((InputTypeConfigurable) sinkFunction).setInputType(getType(), getExecutionConfig());
        }

        StreamSink<T> sinkOperator = new StreamSink<>(clean(sinkFunction));
        DataStreamSink<T> sink = new DataStreamSink<>(this, sinkOperator);

        getExecutionEnvironment().addOperator(sink.getTransformation());
        return sink;
    }

    /** Returns the StreamTransformation that logically creates this DataStream. */
    @Internal
    public StreamTransformation<T> getTransformation() {
        return transformation;
    }
}

