Hadoop好友推荐系统-用户距离计算

来源：互联网发布：少女时代知乎话题编辑：程序博客网时间：2024/06/09 11:41

项目总目录：基于Hadoop的好友推荐系统项目综述

一、用户距离计算

1、前端展示

jsp页面

    <!-- Form for the distance-calculation job: HDFS input path, HDFS output path, submit button. -->
    <table>
        <tr>
            <!-- "for" must reference the id of the associated input -->
            <td><label for="caldistance_input_id">输入路径:</label></td>
            <td><input class="easyui-validatebox" type="text"
                id="caldistance_input_id" data-options="required:true" style="width:300px"
                value="/user/root/_filter/preparevectors" /></td>
        </tr>
        <tr>
            <td><label for="caldistance_output_id">输出路径:</label></td>
            <td><input class="easyui-validatebox" type="text"
                id="caldistance_output_id" data-options="required:true" style="width:300px"
                value="/user/root/_filter/caldistance" /></td>
        </tr>
        <tr>
            <td></td>
            <!-- javascript:void(0) keeps the click from navigating; the click handler submits via AJAX -->
            <td><a id="caldistance_submit_id" href="javascript:void(0)" class="easyui-linkbutton"
                data-options="iconCls:'icon-door_in'">计算距离</a></td>
        </tr>
    </table>

默认输入路径是/user/root/_filter/preparevectors，也就是把数据库中过滤后的数据写入HDFS时的输出目录；默认输出路径是/user/root/_filter/caldistance。

js逻辑

// Click handler for #caldistance_submit_id: submits the vector-distance job.
$('#caldistance_submit_id').bind('click', function (event) {
    // The button is an <a> element; suppress its default navigation so the
    // page is not reloaded while the AJAX request is in flight.
    event.preventDefault();

    var input_ = $('#caldistance_input_id').val();
    var output_ = $('#caldistance_output_id').val();

    // Pop up the progress dialog.
    popupProgressbar('请等待', '提交计算向量距离任务中...', 1000);

    // Submit the job asynchronously via AJAX.
    callByAJax('cloud/cloud_caldistance.action', {input: input_, output: output_});
});

2、后端逻辑

action层

对应的action由前端js中的调用 callByAJax('cloud/cloud_caldistance.action', {...}) 指定，即 cloud_caldistance.action。

/**     * 计算向量之间的距离     */    public void caldistance(){        Map<String ,Object> map = new HashMap<String,Object>();        try{            HUtils.setJobStartTime(System.currentTimeMillis()-2000);//设置启动时间            HUtils.JOBNUM=1;            new Thread(new CalDistance(input,output)).start();//启动线程执行MapReduce任务            map.put("flag", "true");            map.put("monitor", "true");//打开任务监控页面        } catch (Exception e) {            e.printStackTrace();            map.put("flag", "false");            map.put("monitor", "false");            map.put("msg", e.getMessage());        }        Utils.write2PrintWriter(JSON.toJSONString(map));    }

CalDistance类定义如下：

/**
 * Runnable that calculates distances by running {@code CalDistanceJob}
 * through {@code ToolRunner}.
 *
 * The input and output values are converted with
 * {@code HUtils.getHDFSPath} and passed to the job as its two arguments.
 */
public class CalDistance implements Runnable {
    private String input;
    private String output;

    /**
     * @param input  input path handed to {@code HUtils.getHDFSPath}
     * @param output output path handed to {@code HUtils.getHDFSPath}
     */
    public CalDistance(String input, String output) {
        this.input = input;
        this.output = output;
    }

    /**
     * Runs the job. Exceptions are printed rather than rethrown because
     * {@link Runnable#run()} cannot throw checked exceptions; a non-zero exit
     * code from the job is reported on stderr instead of being discarded.
     */
    @Override
    public void run() {
        String[] args = {
                HUtils.getHDFSPath(input),
                HUtils.getHDFSPath(output)
        };
        try {
            int exitCode = ToolRunner.run(HUtils.getConf(), new CalDistanceJob(), args);
            if (exitCode != 0) {
                // Without this the failure of the job would be completely silent.
                System.err.println("CalDistanceJob exited with code " + exitCode
                        + " (input=" + input + ", output=" + output + ")");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getInput() {
        return input;
    }

    public void setInput(String input) {
        this.input = input;
    }

    public String getOutput() {
        return output;
    }

    public void setOutput(String output) {
        this.output = output;
    }
}

MapReduce任务

CalDistanceJob的定义如下：

/** * 计算记录两两之间的距离 * map输出<DoubleWritable,IntPairWritable> * reduce 输出<doubleWritable,IntPairWritable> *             距离，<样本id，样本id> *  */public class CalDistanceJob extends Configured implements Tool {    @Override    public int run(String[] args) throws Exception {        Configuration conf = HUtils.getConf();        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();        if (otherArgs.length !=2) {          System.err.println("Usage: com.kang.filter.CalDistanceJob <in> <out>");          System.exit(2);        }        conf.set("INPUT", otherArgs[0]);        Job job =  Job.getInstance(conf,"calculate vectors  from  input  :"+                otherArgs[0]+" to "+otherArgs[1]);        job.setJarByClass(CalDistanceJob.class);        job.setMapperClass(CalDistanceMapper.class);        job.setReducerClass(CalDistanceReducer.class);        job.setNumReduceTasks(1);        job.setMapOutputKeyClass(DoubleWritable.class   );        job.setMapOutputValueClass(IntPairWritable.class);        job.setOutputKeyClass(DoubleWritable.class);        job.setOutputValueClass(IntPairWritable.class);        job.setOutputFormatClass(SequenceFileOutputFormat.class);//SequenceFileOutputFormat用于二进制文件（序列化文件）的处理        job.setInputFormatClass(SequenceFileInputFormat.class);        SequenceFileInputFormat.addInputPath(job, new Path(otherArgs[0]));        SequenceFileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);        int ret =job.waitForCompletion(true) ? 0 : 1;        long records=job.getCounters().findCounter(FilterCounter.REDUCE_COUNTER)                .getValue();        Utils.simpleLog("距离计算后的总记录数："+records);        HUtils.INPUT_RECORDS=records;        return ret;    }}

MapReduce任务的具体实现

map方法的实现：

/** * 计算用户距离的mapper方法 */public class CalDistanceMapper extends Mapper<IntWritable,DoubleArrIntWritable, DoubleWritable, IntPairWritable> {    private Logger log = LoggerFactory.getLogger(CalDistanceMapper.class);    private Path input;    private DoubleWritable newKey= new DoubleWritable();    private IntPairWritable newValue= new IntPairWritable();    @Override     public void setup(Context cxt){        input=new Path(cxt.getConfiguration().get("INPUT"));//     }    @Override    public void map(IntWritable key,DoubleArrIntWritable  value,Context cxt)throws InterruptedException,IOException{        cxt.getCounter(FilterCounter.MAP_COUNTER).increment(1L);        if(cxt.getCounter(FilterCounter.MAP_COUNTER).getValue()%3000==0){            //每处理3000条数据，输出两条日志信息            log.info("Map处理了{}条记录...",cxt.getCounter(FilterCounter.MAP_COUNTER).getValue());            log.info("Map生成了{}条记录...",cxt.getCounter(FilterCounter.MAP_OUT_COUNTER).getValue());        }        Configuration conf = cxt.getConfiguration();        SequenceFile.Reader reader = null;        FileStatus[] fss=input.getFileSystem(conf).listStatus(input);//获取输入路径下的文件列表        for(FileStatus f:fss){//对于当前的一条用户数据来说，需要循环所有文件读取所有记录，根据条件计算两者距离。            if(!f.toString().contains("part")){                continue; // 排除其他文件，只读取文件名包含“part”的文件（结果文件）            }            try {                reader = new SequenceFile.Reader(conf, Reader.file(f.getPath()),                        Reader.bufferSize(4096), Reader.start(0));                IntWritable dKey = (IntWritable) ReflectionUtils.newInstance(                        reader.getKeyClass(), conf);                DoubleArrIntWritable dVal = (DoubleArrIntWritable) ReflectionUtils.newInstance(                        reader.getValueClass(), conf);                while (reader.next(dKey, dVal)) {// 遍历当前整个文件                    // 为了避免重复计算（i与j的距离和j与i的距离计算一个即可），                    //对于特定的用户i（i是用户id，也就是key.get()），只有当遍历到的用户id>i时（也就是key.get()<dKey.get()），才计算两者距离               
     if(key.get()<dKey.get()){                        cxt.getCounter(FilterCounter.MAP_OUT_COUNTER).increment(1L);                        double dis= HUtils.getDistance(value.getDoubleArr(), dVal.getDoubleArr());                        newKey.set(dis);                        newValue.setValue(key.get(), dKey.get());                        cxt.write(newKey, newValue);                    }                }            } catch (Exception e) {                e.printStackTrace();            } finally {                IOUtils.closeStream(reader);            }        }    }}

map端的输出是<dist，（id1，id2）>的形式，其中dist表示用户距离，id1，id2表示用户的id，且保证id1<id2。
其中HUtils.getDistance的定义如下：

/**     * 计算两个向量之间的距离，这里使用欧式距离     *      * @param inputI     * @param ds     * @return     */    public static double getDistance(double[] inputI, double[] ds) {//向量求距离        double error = 0.0;        for (int i = 0; i < inputI.length; i++) {            error += (inputI[i] - ds[i]) * (inputI[i] - ds[i]);        }        return Math.sqrt(error);//开方求欧式距离    }

reduce方法实现如下：

/** * 计算向量距离reduce方法 */public class CalDistanceReducer extends        Reducer<DoubleWritable, IntPairWritable, DoubleWritable, IntPairWritable> {    public void reduce(DoubleWritable key,Iterable<IntPairWritable> values,Context cxt)throws InterruptedException,IOException{        for(IntPairWritable v:values){            cxt.getCounter(FilterCounter.REDUCE_COUNTER).increment(1);            cxt.write(key, v);        }    }}

reduce端直接把map端的输出写入到HDFS上。格式仍然是<dist，（id1，id2）>的形式。

二、程序运行截图

1、前端

这里写图片描述

2、后台

这里写图片描述

3、HDFS目录

这里写图片描述

阅读全文

0 0