频繁项集实际应用之分类到分类的交叉推荐

来源：互联网发布：省市区json数据 2016 编辑：程序博客网时间：2024/05/29 10:38

首先介绍下频繁项集的相关知识！其实频繁项集是针对购物车提出来的，也就是在购物车中频繁出现的物品的集合。
2.相关概念：
关联规则的支持度：Support(A,B)=包含A和B的事务数/事务总数
关联规则的置信度：Confidence(A,B)= 包含A和B的事务数/包含A事务数
频繁项集：支持度大于等于最小支持度的项集。
强关联规则：同时满足最小支持度和最小置信度的关联规则。
3.关联规则挖掘的步骤：

生成频繁项集，然后生成规则

空谈理论是没有实际意义的，本文基于敝人一个实际的工程项目，来介绍如何应用频繁项集进行关联规则推荐物品。基于商业秘密，所用到的数据均进行了处理！本工程通过Map-Reduce实现，由三个map-reduce过程来完成。

第一个map-reduce类:CrossRecommendStep1,生成频繁一项集,输入文件为order_wash.txt，这个文件在具体的项目中一般都是由用户的订单数据统计而来，具体的日志清洗与统计不在本文讨论范畴。其实一个完整的推荐流程需要经过日志清洗、用户常购清单生成、推荐算法等多个步骤才能完成，本文专注于交叉推荐算法的实际应用！order_wash.txt 格式如下（各列以制表符分隔，category 列内以逗号分隔）：

accessTime	mem_guid	category
2016-04-20 11:31:20	FN05916	CC204316,CC304119,CC404115
2016-04-20 11:31:20	FN05917	CC204315,CC304111,CC404115
2016-04-20 11:31:20	FN05918	CC204314,CC304112,CC404115
2016-04-20 11:31:20	FN05919	CC204311,CC304113,CC404117
2016-04-20 11:31:20	FN05920	CC204311,CC304115,CC404116
2016-04-20 11:31:20	FN05921	CC204311,CC304115,CC404115

下面看CrossRecommendStep1类具体的实现代码，此类主要是统计每个分类下的购买次数（不同用户的购买都计入，同一个用户多次购买算多次，当然你也可以根据自己的业务逻辑来完成这个统计）。最终的输出如下，只贴出来一部分：

category	count
CC204316	1
CC204311	3
CC404115	4

import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;/* * 交叉关联推荐：CC——CC * 用于订单成交后向上交叉推荐 * Step1:生成频繁1项集 * @author jianting.zhao * main函数就是驱动函数，固定的写法，in是输入文件路径，out是输出结果路径 *  *  */public class CrossRecommendStep1 {    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {        Configuration conf = new Configuration();        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();        if (otherArgs.length != 2) {            System.err.println("Usage: Data Deduplication <in> <out> ");            System.exit(2);        }        FileSystem fs = FileSystem.get(conf);        Path outPath = new Path(otherArgs[1]);        fs.deleteOnExit(outPath);        Job job = new Job(conf, "CrossRecommendStep1");        job.setJarByClass(CrossRecommendStep1.class);        job.setMapperClass(CrossRecommendStep1Map.class);        job.setReducerClass(CrossRecommendStep1Reduce.class);        //设置输出类型        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(IntWritable.class);        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(Text.class);        job.setNumReduceTasks(10);                //设置输入及输出文件格式        job.setInputFormatClass(TextInputFormat.class);        job.setOutputFormatClass(TextOutputFormat.class);        //设置输入和输出目录        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));    
    FileOutputFormat.setOutputPath(job, outPath);        job.waitForCompletion(true);    }    public static class CrossRecommendStep1Map extends Mapper<Object, Text, Text, IntWritable> {        private final static IntWritable one = new IntWritable(1);        @Override        protected void map(Object key, Text value, Context context)                throws IOException, InterruptedException {            String[] line = value.toString().split("\t");            if (line.length == 3) {                String accessTime = line[0];                String mem_guid = line[1];                String category = line[2];                String[] temp = category.split(",");                for (int k = 0; k < temp.length; k++) {                    if (temp[k] == null) {                        continue;                    }                    context.write(new Text(temp[k]), one);                }            }        }    }    public static class CrossRecommendStep1Reduce extends Reducer<Text, IntWritable, Text, Text> {        @Override        protected void reduce(Text key, Iterable<IntWritable> value, Context context)                throws IOException, InterruptedException {            int sum = 0;            for (IntWritable n : value) {                sum += n.get();            }            context.write(key, new Text(String.valueOf(sum)));        }    }}

第二个map-reduce类:CrossRecommendStep2,生成频繁二项集，输入文件为order_wash.txt，和CrossRecommendStep1的输出,代码中有注释，

不再做另外的讲解

import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.text.DecimalFormat;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.filecache.DistributedCache;import org.apache.hadoop.fs.FileStatus;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;/** * 生成频繁2项集 *最终的输出格式是 *ccAccBsupportconfidence *CC204311CC20431480.8 * *support是支持度，是ccA=>ccB的总次数，confidence为置信度support/ccA总的购买次数 * @author jianting.zhao */public class CrossRecommendStep2 {    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {        Configuration conf = new Configuration();        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();        if (otherArgs.length != 3) {            System.err.println("Usage: Data Deduplication <in> <in> <out> ");            System.exit(2);        }        FileSystem fs = FileSystem.get(conf);        Path freqset1 = new Path(otherArgs[1]);        FileStatus[] user_stat = fs.listStatus(freqset1);        for (FileStatus f : user_stat) {//缓存上一步的输出            if (f.getPath().getName().indexOf("_SUCCESS") == -1 && f.isFile()) {                DistributedCache.addCacheFile(f.getPath().toUri(), conf);            }        }        if (fs.exists(new Path(otherArgs[2]))) {            fs.delete(new Path(otherArgs[2]),true);        }        Job job = new Job(conf, "CrossRecommendStep2");        job.setJarByClass(CrossRecommendStep2.class);        
job.setMapperClass(CrossRecommendStep2Map.class);        job.setReducerClass(CrossRecommendStep2Reduce.class);        //设置输出类型        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(IntWritable.class);        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(Text.class);        job.setNumReduceTasks(1);        //设置输入和输出目录        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));        job.waitForCompletion(true);    }    public static class CrossRecommendStep2Map extends Mapper<Object, Text, Text, IntWritable> {        HashMap<String, Integer> ctg_count = new HashMap<String, Integer>();        private static final IntWritable one = new IntWritable(1);        /*         * 加载频繁1项集         * 分类下购买次数大于10的构建频繁一项集,存储到HashMap<String, Integer> ctg_count中         */        protected void setup(Context context)                throws IOException, InterruptedException {            Configuration conf = context.getConfiguration();            Path[] file = DistributedCache.getLocalCacheFiles(conf);            FileSystem fs = FileSystem.getLocal(conf);            String line = null;            for (Path path : file) {                BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));                while ((line = reader.readLine()) != null) {                    String[] tmp = line.split("\t");                    if (tmp.length != 2) {                        continue;                    }                    String ctg = tmp[0];                    int num = Integer.parseInt(tmp[1]);                    if (num >= 10) {                        ctg_count.put(ctg, num);                    }                }            }        }        /**         * 本函数的输出是原始文件，根据频繁一项集，生成ccA=>ccB和ccB=>ccA关联交叉关系         */        @Override        protected void map(Object key, Text value, Context context)                throws IOException, InterruptedException 
{            String[] line = value.toString().split("\t");            if (line.length == 3) {                String stg_set = line[2];                List<String> order_list = new ArrayList<String>();                String[] tmp = stg_set.split(",");                //剔除不满足频繁一项集的CC                for (int k = 0; k < tmp.length; k++) {                    if (tmp[k] == null) {                        continue;                    }                    if (ctg_count.get(tmp[k]) != null) {                        order_list.add(tmp[k]);                    }                }                //构建频繁2项集                for (int i = 0; i < order_list.size(); i++) {                    String ccA = order_list.get(i);                    for (int j = i + 1; j < order_list.size(); j++) {                        String ccB = order_list.get(j);                        context.write(new Text(ccA + ":" + ccB), one);                        context.write(new Text(ccB + ":" + ccA), one);                    }                }            }        }    }    public static class CrossRecommendStep2Reduce extends Reducer<Text, IntWritable, Text, Text> {        HashMap<String, Integer> ctg_count = new HashMap<String, Integer>();        DecimalFormat df = new DecimalFormat("0.00");        /*         * 加蒌频繁1项集         */        protected void setup(Context context)                throws IOException, InterruptedException {            Configuration conf = context.getConfiguration();            Path[] file = DistributedCache.getLocalCacheFiles(conf);            FileSystem fs = FileSystem.getLocal(conf);            String line = null;            for (Path path : file) {                BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));                while ((line = reader.readLine()) != null) {                    String[] tmp = line.split("\t");                    if (tmp.length != 2) {                        continue;                    }                    String ctg = 
tmp[0];                    int num = Integer.parseInt(tmp[1]);                    if (num >= 10) {                        ctg_count.put(ctg, num);                    }                }            }        }        /*         * 计算2项集的支持度和置信度         */        @Override        protected void reduce(Text key, Iterable<IntWritable> value, Context context)                throws IOException, InterruptedException {            String[] line = key.toString().split(":");            String ccA = line[0];            String ccB = line[1];            int ccA_num = ctg_count.get(ccA);            int sum = 0;            for (IntWritable n : value) {                sum += n.get();            }            //支持度            int support = sum;            //置信度            double confidence = (double) sum / ccA_num;            StringBuffer sb = new StringBuffer();            sb.append(support).append("\t").append(df.format(confidence));            if (confidence > 0.0) {                context.write(new Text(ccA + "\t" + ccB), new Text(sb.toString()));            }        }    }}

第三个map-reduce类:CrossRecommendStep3,用常购清单给出推荐结果,至于常购清单的生成不在本文讨论范畴.

推荐的整体思想是，当ccA=>ccB高于某个阈值时，用ccB的常购商品作为ccA的推荐结果

import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.HashMap;import java.util.Map;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.filecache.DistributedCache;import org.apache.hadoop.fs.FileStatus;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;/** * 根据关联类目，推荐常购清单商品 *此处推荐用到了分类下的常购清单 *推荐的整本思想是，当ccA=>ccB高于某个阀值时，用ccB的常购商品作为ccA的推荐结果 * @author jianting.zhao */public class CrossRecommendStep3 {    @SuppressWarnings("deprecation")    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {        Configuration conf = new Configuration();        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();        if (otherArgs.length != 3) {            System.err.println("Usage: Data Deduplication <in> <in> <out> ");            System.exit(2);        }        //加载常购清单        FileSystem fs = FileSystem.get(conf);        Path freqset1 = new Path(otherArgs[1]);        FileStatus[] user_stat = fs.listStatus(freqset1);        for (FileStatus f : user_stat) {            if (f.getPath().getName().indexOf("_SUCCESS") == -1) {                DistributedCache.addCacheFile(f.getPath().toUri(), conf);            }        }        if (fs.exists(new Path(otherArgs[2]))) {            fs.delete(new Path(otherArgs[2]),true);        }        Job job = new Job(conf, "CrossRecommendStep3");        job.setJarByClass(CrossRecommendStep3.class);        job.setMapperClass(CrossRecommendStep3Map.class);        job.setReducerClass(CrossRecommendStep3Reduce.class);        //设置输出类型        
job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(Text.class);        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(Text.class);        job.setNumReduceTasks(1);        //设置输入和输出目录        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));        job.waitForCompletion(true);    }    public static class CrossRecommendStep3Map extends Mapper<Object, Text, Text, Text> {        /*         * 加载频繁二项集         */        @Override        protected void map(Object key, Text value, Context context)                throws IOException, InterruptedException {        String[] category = key.toString().split("\t");            String[] line = value.toString().split("\t");            if (category.length == 2) {                String ctg1 = category[0];                String ctg_rec = category[1];                /*                 * 这段代码是限制关联度高于多少时才用来推荐                 */                /*double support = Double.parseDouble(line[0]);                double confidence = Double.parseDouble(line[0]);                if(support < 100 || confidence< 0.8){                return;                }                */                if (ctg1 != null && ctg_rec != null) {                    context.write(new Text(ctg1), new Text(ctg_rec));                    context.write(new Text(ctg_rec), new Text(ctg1));                }            }        }    }    public static class CrossRecommendStep3Reduce extends Reducer<Text, Text, Text, Text> {        Map<String, ArrayList<String>> ctg_often_items = new HashMap<String, ArrayList<String>>();        /*         *加载常购清单         */        protected void setup(Context context)                throws IOException, InterruptedException {            Configuration conf = context.getConfiguration();            Path[] file = DistributedCache.getLocalCacheFiles(conf);            FileSystem fs = FileSystem.getLocal(conf);            String line = null;   
         for (Path path : file) {                BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));                while ((line = reader.readLine()) != null) {                    String[] tmp = line.split("\t");                    if (tmp.length != 3) {                        continue;                    }                    String ctg = tmp[0];                    String item_id = tmp[1];                    ArrayList<String> often_items = ctg_often_items.get(ctg);                    if (often_items == null || often_items.isEmpty()) {                        often_items = new ArrayList<String>();                        ctg_often_items.put(ctg, often_items);                    }                    if (often_items.size() < 20) {                        often_items.add(item_id);                    }                }            }        }        @Override        protected void reduce(Text key, Iterable<Text> value, Context context)                throws IOException, InterruptedException {            ArrayList<String> recommend_list = new ArrayList<String>();            //用常购清单商品给出推荐            if (recommend_list.size() < 50) {                int completion_num = 50 - recommend_list.size();                int temp = 0;                int size = 0;                while (completion_num > 0 && temp <= size) {                    if (recommend_list.size() > 50) {                        break;                    }                    for (Text val : value) {                        if (recommend_list.size() > 50) {                            break;                        }                        String rec_ctg = val.toString();                        if (ctg_often_items.get(rec_ctg) == null) {                            continue;                        }                        ArrayList<String> offen_items = ctg_often_items.get(rec_ctg);                        size = offen_items.size() - 1;                        if (temp >= offen_items.size()) {     
                       continue;                        }                        String comple_itemid = offen_items.get(temp);                        if (!recommend_list.contains(comple_itemid)) {                            recommend_list.add(comple_itemid);                            completion_num--;                        }                    }                    temp++;                }            }            if (recommend_list.size() > 0) {                context.write(key, new Text(recommend_list.toString()));            }        }    }}

阅读全文

1 0