《Spark SQL大数据实例开发》 9.2 Comprehensive Case Study: E-Commerce Website Search Ranking Statistics

9.2.1 Case Overview
    This section demonstrates a comprehensive website-search case study, using JD.com as the example. A user logs on to the JD website, types a query into the search bar, and clicks the search button to find the products they need. As soon as the user enters the first word, JD uses the click-based product search ranking to display hot search terms in a drop-down list beneath the search bar, helping the user quickly select the product to search for. This case study implements functionality similar to JD's search: from the log records of users' search terms, it lists the top 3 searched products for each day. In production, the back end could persist these ranking records to a database for consumption by a web system or other applications; here we save the top-3 ranking records to the local disk file system in JSON format.
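For reference, Spark's JSON writer emits one object per line, so with the final query in this section each persisted top-3 record would look roughly like the following (values invented for illustration):

{"UserID":"98415b9c-f3d4-45c3-bc7f-dce3126c6c0b", "Item":"洗衣机", "count":9}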
The case study code consists of two modules:
(1) Data generation module: the simulated data could be produced by a crawler program that scrapes real user search data from the web and then cleans it via ETL. To simplify the crawling and cleaning steps, we instead generate mock data: a hand-built simulated data file that satisfies the case study's data requirements and supports the same functionality.
(2) Website search ranking module: find the top 3 products searched by users each day (see the worked example of the ranking logic right after this list).
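To make the top-3 selection concrete before reading the code: the ranking query below uses the window function row_number() OVER (PARTITION BY UserID ORDER BY count DESC), which numbers each partition's rows 1, 2, 3, ... from the highest search count down; the outer WHERE rank <= 3 then keeps only the first three rows per partition. For one user, with hypothetical counts:

UserID          Item      count   rank
98415b9c-...    显卡      12      1   (kept)
98415b9c-...    小米      9       2   (kept)
98415b9c-...    洗衣机    7       3   (kept)
98415b9c-...    显示器    5       4   (dropped by rank <= 3)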


Data generation code: SparkSQLUserlogsHottestDataManually.java

package com.dt.imf;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Random;

public class SparkSQLUserlogsHottestDataManually {

    public static void main(String[] args) {
        long numberItems = 10000;
        generateUserLogs(numberItems, "G:\\SparkSQLData\\");
    }

    /**
     * Generate the user search log.
     * Record schema: Date, UserID, Item, City, Device (tab separated).
     *
     * @param numberItems number of log records to generate
     * @param path        output directory
     */
    private static void generateUserLogs(long numberItems, String path) {
        StringBuffer userLogBuffer = new StringBuffer();
        String filename = "SparkSQLUserlogsHot.log";
        for (int i = 0; i < numberItems; i++) {
            String date = getCountDate(null, "yyyy-MM-dd", -1); // yesterday's date
            String userID = generateUserID();
            String itemID = generateItemID();
            String cityID = generateCityIDs();
            String device = generateDevice();
            userLogBuffer.append(date).append("\t").append(userID).append("\t")
                    .append(itemID).append("\t").append(cityID).append("\t")
                    .append(device).append("\n");
        }
        // Write the whole buffer once, instead of rewriting the file on every iteration.
        writeLog(path, filename, userLogBuffer.toString());
    }

    public static void writeLog(String path, String filename, String strUserLog) {
        FileWriter fw = null;
        PrintWriter out = null;
        try {
            File writeFile = new File(path + filename);
            if (writeFile.exists()) {
                writeFile.delete(); // start each run with a fresh file
            }
            writeFile.createNewFile();
            fw = new FileWriter(writeFile, true);
            out = new PrintWriter(fw);
            out.print(strUserLog);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (out != null) out.close();
                if (fw != null) fw.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Return the date that is `step` days away from the given date
     * (or from today when date is null), formatted with the given pattern.
     */
    public static String getCountDate(String date, String pattern, int step) {
        SimpleDateFormat sdf = new SimpleDateFormat(pattern);
        Calendar cal = Calendar.getInstance();
        if (date != null) {
            try {
                cal.setTime(sdf.parse(date));
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
        cal.add(Calendar.DAY_OF_MONTH, step);
        return sdf.format(cal.getTime());
    }

    /** Pick a random user ID from a fixed pool of 20 UUIDs. */
    private static String generateUserID() {
        Random random = new Random();
        String[] userIDs = { "98415b9c-f3d4-45c3-bc7f-dce3126c6c0b", "7371b4bd-8535-461f-a5e2-c4814b2151e1",
                "49852bfa-a662-4060-bf68-0dddde5feea1", "8768f089-f736-4346-a83d-e23fe05b0ecd",
                "a76ff021-049c-4a1a-8372-02f9c51261d5", "8d5dc011-cbe2-4332-99cd-a1848ddfd65d",
                "a2bccbdf-f0e9-489c-8513-011644cb5cf7", "89c79413-a7d1-462c-ab07-01f0835696f7",
                "8d525daa-3697-455e-8f02-ab086cda7851", "c6f57c89-9871-4a92-9cbe-a2d76cd79cd0",
                "19951134-97e1-4f62-8d5c-134077d1f955", "3202a063-4ebf-4f3f-a4b7-5e542307d726",
                "40a0d872-45cc-46bc-b257-64ad898df281", "b891a528-4b5e-4ba7-949c-2a32cb5a75ec",
                "0d46d52b-75a2-4df2-b363-43874c9503a2", "c1e4b8cf-0116-46bf-8dc9-55eb074ad315",
                "6fd24ac6-1bb0-4ea6-a084-52cc22e9be42", "5f8780af-93e8-4907-9794-f8c960e87d34",
                "692b1947-8b2e-45e4-8051-0319b7f0e438", "dde46f46-ff48-4763-9c50-377834ce7137" };
        return userIDs[random.nextInt(userIDs.length)];
    }

    /** Pick a random item from the product pool. */
    private static String generateItemID() {
        Random random = new Random();
        String[] itemIDs = { "小米", "休闲鞋", "洗衣机", "显示器", "显卡", "洗衣液", "行车记录仪" };
        return itemIDs[random.nextInt(itemIDs.length)];
    }

    /** Pick a random city name. */
    private static String generateCityIDs() {
        Random random = new Random();
        String[] cityNames = { "上海", "北京", "深圳", "广州", "纽约", "伦敦", "东京", "首尔", "莫斯科", "巴黎" };
        return cityNames[random.nextInt(cityNames.length)];
    }

    /** Pick a random device type. */
    private static String generateDevice() {
        Random random = new Random();
        String[] devices = { "android", "iphone", "ipad" };
        return devices[random.nextInt(devices.length)];
    }
}
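Running the generator produces a tab-separated log file. Assuming, purely for illustration, that yesterday's date was 2016-05-07, the records would look like this (all values drawn at random from the pools above):

2016-05-07	98415b9c-f3d4-45c3-bc7f-dce3126c6c0b	洗衣机	上海	iphone
2016-05-07	7371b4bd-8535-461f-a5e2-c4814b2151e1	显卡	北京	android
2016-05-07	49852bfa-a662-4060-bf68-0dddde5feea1	小米	深圳	ipad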


Website search ranking code: SparkSQLUserlogsHot.java

package com.dt.imf;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;

import scala.Tuple2;

public class SparkSQLUserlogsHot {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLUserlogsHottest");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // A HiveContext is required because row_number() is a Hive window function in Spark 1.x.
        SQLContext sqlContext = new HiveContext(sc);

        JavaRDD<String> lines = sc.textFile("G:\\SparkSQLData\\SparkSQLUserlogsHot.log");

        // Broadcast the device filter so all tasks share one read-only copy.
        String device = "iphone";
        final Broadcast<String> deviceBroadcast = sc.broadcast(device);

        JavaRDD<String> lineFilter = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                return s.contains(deviceBroadcast.value());
            }
        });

        // Concatenate the key "date#Item#userID" and build the KV pair (date#Item#userID, 1).
        JavaPairRDD<String, Integer> pairs = lineFilter.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] splitedLine = line.split("\t");
                String dateAndItemAndUserID = splitedLine[0] + "#" + splitedLine[2] + "#" + splitedLine[1];
                return new Tuple2<String, Integer>(dateAndItemAndUserID, 1);
            }
        });

        // reduceByKey: count the searches per (date, item, user).
        JavaPairRDD<String, Integer> pairsCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        List<Tuple2<String, Integer>> pairsCountRows = pairsCount.collect();

        // Dynamically assemble one JSON string per aggregated record.
        List<String> userLogsInformations = new ArrayList<String>();
        for (Tuple2<String, Integer> row : pairsCountRows) {
            // Split the key back into its three fields.
            String[] rowSplitedLine = row._1.split("#");
            String rowDate = rowSplitedLine[0];
            String rowItemID = rowSplitedLine[1];
            String rowUserID = rowSplitedLine[2];
            // JSON record fields: Date, UserID, Item, count.
            String jsonZip = "{\"Date\":\"" + rowDate + "\", \"UserID\":\"" + rowUserID
                    + "\", \"Item\":\"" + rowItemID + "\", \"count\":" + row._2 + " }";
            userLogsInformations.add(jsonZip);
        }

        // Construct a DataFrame from an RDD whose elements are JSON strings.
        JavaRDD<String> userLogsInformationsRDD = sc.parallelize(userLogsInformations);
        DataFrame userLogsInformationsDF = sqlContext.read().json(userLogsInformationsRDD);
        userLogsInformationsDF.show();

        // Register the DataFrame as a temporary table.
        userLogsInformationsDF.registerTempTable("userlogsInformations");

        /*
         * Use a subquery to extract the target data. Inside it, the window
         * function row_number ranks the rows of each group:
         *   PARTITION BY specifies the grouping key of the window;
         *   ORDER BY sorts the rows within each group.
         */
        String sqlText = "SELECT UserID, Item, count "
                + "FROM ("
                + "SELECT UserID, Item, count, "
                + "row_number() OVER (PARTITION BY UserID ORDER BY count DESC) rank "
                + "FROM userlogsInformations"
                + ") sub_userlogsInformations "
                + "WHERE rank <= 3";
        System.out.println(sqlText);

        DataFrame userLogsHotResultDF = sqlContext.sql(sqlText);
        userLogsHotResultDF.show();

        // Persist the top-3 records to the local file system in JSON format.
        userLogsHotResultDF.write().format("json").save("G:\\SparkSQLData\\Result15.json");

        sc.stop();
    }
}
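As a quick sanity check (a minimal sketch, assuming the same local path used above), the persisted result can be read back with Spark 1.x's standard JSON reader:

DataFrame savedDF = sqlContext.read().json("G:\\SparkSQLData\\Result15.json");
savedDF.show(); // prints the UserID, Item, and count columns of the top-3 records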

