利用艺术家的整数ID映射将标签转换为向量

来源:互联网 发布:武汉ui培训知乎 编辑:程序博客网 时间:2024/06/07 01:07
<strong><span style="font-size:18px;">/*** * @author YangXin * @info Mapper选择艺术家的整数特征ID然后建立单个特征的向量。这些一维的部分 * 向量会传给Reducer,后者会将这些向量简单地进行联结,生成一个完整的向量。 */package unitTwelve;import java.io.IOException;import java.util.HashMap;import java.util.Map;import java.util.regex.Pattern;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.io.DefaultStringifier;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.util.GenericsUtil;import org.apache.mahout.math.NamedVector;import org.apache.mahout.math.SequentialAccessSparseVector;import org.apache.mahout.math.VectorWritable;public class VectorMapper extends Mapper<LongWritable, Text, Text, VectorWritable>{private Pattern splitter;private VectorWritable writer;private Map<String, Integer> dictionary = new HashMap<String, Integer>();@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{String[] fields = splitter.split(value.toString());if(fields.length < 4){context.getCounter("Map", "LinesWithErrors").increment(1);return;}String arrtist = fields[1];String tag = fields[2];double weight = Double.parseDouble(fields[3]);NamedVector vector = new NamedVector(new SequentialAccessSparseVector(dictionary.size()), tag);vector.set(dictionary.get(value), weight);writer.set(vector);context.write(new Text(tag), writer);}@Overrideprotected void setup(Context context) throws IOException, InterruptedException{super.setup(context);Configuration conf = context.getConfiguration();DefaultStringifier<Map<String, Integer>> mapStringifier = new DefaultStringifier<Map<String, Integer>>(conf, GenericsUtil.getClass(dictionary));dictionary = mapStringifier.fromString(conf.get("dictionary"));splitter = Pattern.compile("<sep>");writer = new VectorWritable();}}</span></strong>

0 0
原创粉丝点击