13.Spark SQL:UDF自定义函数实战

来源:互联网 发布:学生赚钱软件 编辑:程序博客网 时间:2024/05/16 12:03
UDF:User Defined Function。用户自定义函数。

scala版本
package cn.spark.study.sql

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.StructField
// Brings in Spark SQL's built-in functions (kept from the original import list).
import org.apache.spark.sql.functions._

/**
 * Spark SQL UDF (User Defined Function) example, Scala version.
 *
 * Builds a tiny DataFrame of names, registers it as the temp table "names",
 * registers a "strLen" UDF, and runs it from a SQL query.
 */
object UDF {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("UDF")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Build mock data: one Row per name, over 5 partitions.
    val names = Array("Leo", "Mary", "Jack", "Tom")
    val namesRDD = sc.parallelize(names, 5)
    val namesRowRDD = namesRDD.map(name => Row(name))
    val structType = StructType(Array(StructField("name", StringType, true)))
    val namesDF = sqlContext.createDataFrame(namesRowRDD, structType)

    // Register the DataFrame as a temp table named "names".
    namesDF.registerTempTable("names")

    // Define and register the custom function:
    //   define   - a plain anonymous function
    //   register - SQLContext.udf.register()
    sqlContext.udf.register("strLen", (str: String) => str.length())

    // Use the custom function from SQL and print every result row.
    sqlContext.sql("select name, strLen(name) from names")
      .collect()
      .foreach(println)

    // BUGFIX: the original never stopped the SparkContext; stop it so the
    // application releases its resources cleanly on exit.
    sc.stop()
  }
}

java版本

package cn.spark.study.sql;import java.util.Arrays;import java.util.List;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.Function;import org.apache.spark.sql.DataFrame;import org.apache.spark.sql.Row;import org.apache.spark.sql.RowFactory;import org.apache.spark.sql.SQLContext;import org.apache.spark.sql.api.java.UDF1;import org.apache.spark.sql.types.DataTypes;import org.apache.spark.sql.types.StructField;import org.apache.spark.sql.types.StructType;/** * SPARK sql编写UDF自定义函数(JAVA) * @author leizq120310 * */public class UDF {public static void main(String[] args) {SparkConf conf = new SparkConf().setMaster("local").setAppName("UDF");JavaSparkContext sc = new JavaSparkContext(conf);SQLContext sqlContext = new SQLContext(sc);// 构造模拟数据List<String> names = Arrays.asList("Leo", "Mary", "Jack", "Tom");JavaRDD<String> namesRDD = sc.parallelize(names);JavaRDD<Row> namesRowRDD = namesRDD.map(new Function<String, Row>() {private static final long serialVersionUID = 1L;@Overridepublic Row call(String name) throws Exception {// TODO Auto-generated method stubreturn RowFactory.create(name);}});// 构造元数据List<StructField> structFields = Arrays.asList(DataTypes.createStructField("name", DataTypes.StringType, true));StructType structType = DataTypes.createStructType(structFields);// 创建DataFrameDataFrame namesDF = sqlContext.createDataFrame(namesRowRDD, structType);// 注册一张names表namesDF.registerTempTable("names");    // 定义和注册自定义函数    // 定义函数:自己写匿名函数    // 注册函数:SQLContext.udf().reqister()/** * Function可以使用UDF1到UDF22/21,所表达的意思就是几个参数,2代就指两个参数,10代指10个参数 * return 返回的即为UDF<> 的最后一个参数 */sqlContext.udf().register("strLen", new UDF1<String, Integer>() {private static final long serialVersionUID = 1L;@Overridepublic Integer call(String name) throws Exception {// TODO Auto-generated method stubreturn name.length();}}, DataTypes.IntegerType);    // 使用自定义函数,查询数据List<Row> rows = 
sqlContext.sql("select name, strLen(name) from names").javaRDD().collect();// 打印数据for (Row row : rows){System.out.println("name:" + row.get(0) + "  长度:" + row.get(1));}      // 关闭JavaSparkConfsc.close();}}