Garbled Chinese characters when loading files in Spark




In some cases, Chinese text that Spark loads from HDFS or from a local file comes back garbled. The fix is to handle the character encoding at load time: read the raw bytes and decode them with the file's actual encoding (GBK in the example below), so the resulting strings are proper Unicode/UTF-8.
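For context, Spark's default sc.textFile (backed by Hadoop's Text record reader) always decodes line bytes as UTF-8, so a GBK-encoded file prints as mojibake no matter what JVM properties are set. A minimal sketch of the failing read, assuming Spark is on the classpath; the object name and the path gbk_sample.txt are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

object GarbledReadDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("garbled-read").setMaster("local[2]"))
    // sc.textFile decodes every line as UTF-8; if gbk_sample.txt is GBK-encoded,
    // the Chinese fields print as garbled characters.
    sc.textFile("gbk_sample.txt").take(5).foreach(println)
    sc.stop()
  }
}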

  • The existing solution and an example (reading the Hadoop file directly):

val rdd = sc.hadoopFile(file_path, classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text]).map(
  pair => new String(pair._2.getBytes, 0, pair._2.getLength, "GBK"))

Note: file_path is the path of the file you are reading. TextInputFormat, LongWritable and Text are Hadoop classes shipped with Spark's Hadoop dependency, so the following imports are needed at the top of the file:

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
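The same idea can be wrapped in a small helper so the charset becomes a parameter. This is only a sketch under the same assumptions as above; the name readTextWithCharset is illustrative and not part of the original code, and the def can live inside an object or be pasted into spark-shell:

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Read a text file whose bytes use `charset` (e.g. "GBK") and return decoded Strings.
// The helper name is illustrative, not from the original post.
def readTextWithCharset(sc: SparkContext, path: String, charset: String): RDD[String] =
  sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
    .map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) }

// Usage: val rdd = readTextWithCharset(sc, file_path, "GBK")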

A full example follows:

  • The code below loads data containing Chinese text, computes the time difference between two timestamp columns, and converts it to seconds (a standalone sketch of this calculation appears after the source code).
  • The input data looks like this:
2017-7-1 0:0:46,00,BS01387D,BS01387D,0E232,0E232,1,0,113.994522,22.691019,0.000000,17-6-30 8:45:18,18.000000,278.700012,0.000000,43925.289062
2017-7-1 0:0:46,00,BS06385D,BS06385D,09150,09150,1,0,114.339653,22.707256,0.000000,17-7-1 0:0:44,0.000000,255.800003,0.000000,11011.059570
2017-7-1 0:0:46,00,BS01786D,BS01786D,4526,4526,1,0,114.465042,22.598454,0.000000,17-6-30 8:45:17,0.000000,308.200012,0.000000,10158.250000
2017-7-1 0:0:46,01,BS06988D,BS06988D,M2823,M2823,1,0,114.061539,22.373081,0.000000,17-6-30 8:49:24,27.000000,332.899994,0.000000,29766.679688,F_bT0059,0-0-0 0:0:0,F_bT0060
2017-7-1 0:0:46,01,粤BU0010,粤BU0010,M3223,M3223,1,0,114.212997,22.693300,0.000000,17-6-30 8:45:22,31.000000,131.000000,31.000000,0.000000,FHG0313,17-6-30 8:45:22,FHG0140
2017-7-1 0:0:46,00,BS03448D,BS03448D,M4633,M4633,1,0,114.107643,22.592354,0.000000,17-6-30 8:49:25,0.000000,3.800000,0.000000,10226.230469
2017-7-1 0:0:46,00,BS01681D,BS01681D,09060,09060,1,0,114.199982,22.652081,0.000000,17-6-30 8:45:15,23.000000,148.399994,0.000000,16442.609375
2017-7-1 0:0:46,00,BS06973D,BS06973D,M2663,M2663,1,0,114.259277,22.723864,0.000000,17-6-30 8:49:23,0.000000,34.099998,0.000000,50952.250000
2017-7-1 0:0:46,00,BS34389D,BS34389D,03110,03110,1,0,113.873886,22.572926,0.000000,17-6-30 8:45:17,0.000000,124.800003,0.000000,8244.110352
2017-7-1 0:0:46,00,粤BCC313,粤BCC313,03130,03130,1,0,114.098396,22.702860,0.000000,17-6-30 8:49:22,0.000000,98.000000,0.000000,45027.691406
2017-7-1 0:0:46,00,BS06189D,BS06189D,09780,09780,1,0,114.365410,22.737383,0.000000,17-6-30 8:45:16,26.000000,73.400002,0.000000,12547.530273
2017-7-1 0:0:46,00,BS06762D,BS06762D,03690,03690,1,0,113.919914,22.533268,0.000000,17-6-30 8:49:22,0.000000,184.699997,0.000000,15405.419922
2017-7-1
  • The program source code:
import java.text.{DecimalFormat, SimpleDateFormat}
import java.util.Date
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

// spark-submit --class analyze.analyze BusDataAnalyze-1.0-SNAPSHOT-jar-with-dependencies.jar /tmp/STRING_20170626

/**
  * Created by User on 2017/7/3.
  */
object analyze {

  // Difference between two timestamps, converted to seconds.
  def getCoreTime(start_time: String, end_Time: String) = {
    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    var begin: Date = df.parse(start_time)
    var end: Date = df.parse(end_Time)
    var between: Long = (begin.getTime() - end.getTime()) / 1000 // convert to seconds
    between
  }

  // Normalize a timestamp string to the yyyy-MM-dd HH:mm:ss format.
  def timeStrFormat(time_str: String): String = {
    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    var date: Date = df.parse(time_str)
    var result = df.format(date)
    result
  }

  def main(args: Array[String]): Unit = {
    val pps = System.getProperties()
    var time_str1 = ""
    var time_str2 = ""
    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    pps.setProperty("file.encoding", "UTF-8")
    var file_path = ""
    if (args.length > 0) {
      file_path = args(0).toString
    } else {
      file_path = "E:\\sample_bus.txt"
    }
    //val sparkConf = new SparkConf().setAppName("Spark-Custom-Partitioner").setMaster("spark://hadoop-1:7077")
    //val sparkConf = new SparkConf().setAppName("bus_analyze").setMaster("spark://hadoop-1:7077")
    val sparkConf = new SparkConf().setAppName("bus_analyze").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    // Read the raw Text bytes and decode them as GBK so the Chinese fields are not garbled.
    val rdd = sc.hadoopFile(file_path, classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text]).map(
      pair => new String(pair._2.getBytes, 0, pair._2.getLength, "GBK"))
    System.out.println("read data successfully")

    // Keep records that have enough fields and whose two timestamps parse and fall on the same day.
    val filter_length = rdd.filter(_.split(",").length > 10).filter(x => {
      var flag = true
      val items = x.split(",")
      try {
        time_str1 = timeStrFormat(items(0)).substring(0, 10)
        time_str2 = timeStrFormat("20" + items(11)).substring(0, 10)
      } catch {
        case e: Exception =>
          println("cdr parse timestamp wrong")
          flag = false
      }
      if (flag && !time_str1.equals(time_str2)) {
        flag = false
      }
      flag
    })
    filter_length.foreach(println)
    System.out.println("after filter ......")
    System.err.println("error occurred")

    // Map to (items(4), difference in seconds between the two timestamps); keep differences above 2 seconds.
    val time_format = filter_length.map(x => {
      val items = x.split(",")
      val time_str1 = timeStrFormat(items(0))
      val time_str2 = timeStrFormat("20" + items(11))
      val time_between = getCoreTime(time_str1, time_str2)
      (items(4), time_between)
    }).filter(_._2 > 2)
    // time_format.foreach(println)

    val time_format_specail = time_format.filter(x => {
      var result = true
      if (x._1.contains("M385") || x._1.contains("M228"))
        result = true
      else
        result = false
      result
    })
    time_format.foreach(x => { System.out.println(x) })
    // time_format.saveAsTextFile("/user/gongguiwei/bus_analyze/" + file_name + "/")
    System.out.println("over soon")
    Thread.sleep(120 * 1000)
    // time_format_specail.saveAsTextFile("/user/gongguiwei/bus_analyze/" + file_name + "specail/")
  }
}
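As noted above, here is the core timestamp arithmetic in isolation, using field 0 and field 11 of the first sample record ("20" is prepended to the two-digit year, exactly as the program does). This is a minimal standalone sketch, not part of the original source; the object name is illustrative:

import java.text.SimpleDateFormat

object TimeDiffSketch {
  def main(args: Array[String]): Unit = {
    val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    // SimpleDateFormat is lenient by default, so non-zero-padded values like "7-1 0:0:46" still parse.
    val reportTime = df.parse("2017-7-1 0:0:46")        // field 0 of the first record
    val gpsTime    = df.parse("20" + "17-6-30 8:45:18") // field 11, with "20" prepended
    val seconds = (reportTime.getTime - gpsTime.getTime) / 1000 // milliseconds -> seconds
    println(seconds) // 54928 seconds, assuming no DST change over this interval
  }
}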