Spark + psycopg2 + PostgreSQL: implementing INSERT/UPDATE (upsert)

Spark SQL is mainly designed for SELECT-style queries and does not provide INSERT or UPDATE. This article uses Spark together with psycopg2 to perform insert/update (upsert) operations against a PostgreSQL database; the code below is commented in detail:
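
Note: the ON CONFLICT ... DO UPDATE clause used below requires PostgreSQL 9.5 or later, and it only fires when there is a unique constraint (or primary key) on t3.id. The original does not show the schema, so here is a minimal setup sketch, with column types assumed from the queries that follow:

import psycopg2

conn = psycopg2.connect(database="testdb", user="postgres", password="password", host="127.0.0.1", port="5432")
cur = conn.cursor()
# id must be a PRIMARY KEY (or carry a UNIQUE index) for ON CONFLICT (id) to match
cur.execute("CREATE TABLE IF NOT EXISTS t3 (id integer PRIMARY KEY, name text, val integer)")
conn.commit()
conn.close()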

import psycopg2
import psycopg2.extras

from pyspark import SparkContext
from pyspark.sql import Row


# opera: transform one joined record (id, name, income, val) into a Row
# whose val is the sum of t1.income and t2.val
def opera(x):
    newval = x[2] + x[3]
    return Row(id=x[0], name=x[1], val=newval)


# save_df_to_db: upsert one partition of the RDD, opening a single
# connection per partition rather than per record
def save_df_to_db(records):
    db_conn = psycopg2.connect(database="testdb", user="postgres", password="password", host="127.0.0.1", port="5432")
    cur = db_conn.cursor()
    # ON CONFLICT (id) turns the INSERT into an UPDATE when the id already exists
    upsert_query = ("INSERT INTO t3 (id, name, val) VALUES (%(id)s, %(name)s, %(val)s) "
                    "ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, val = EXCLUDED.val;")
    try:
        # convert each Row to a dict so the named placeholders resolve
        cur.executemany(upsert_query, [r.asDict() for r in records])
        db_conn.commit()
    except Exception as e:
        print("Error in executing save_df_to_db:", e)
    finally:
        db_conn.close()
 
# Fetch the rows to process from the database (runs on the driver)
conn = psycopg2.connect(database="testdb", user="postgres", password="password", host="127.0.0.1", port="5432")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute("SELECT t1.id , t2.name, t1.income, t2.val FROM t1 inner join t2 on t1.id = t2.id")
rows = cursor.fetchall()


# Spark entry point
sc = SparkContext(appName="PythonSQL")


# Parallelize the query result into an RDD for further processing
rdd = sc.parallelize(rows)


# Transform each record
rdd2 = rdd.map(opera)


# Write back to the database, one partition at a time
rdd2.foreachPartition(save_df_to_db)


conn.close()
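
Two practical notes. First, foreachPartition runs on the executors, so psycopg2 must be installed on every worker node, and one connection is opened per partition rather than per record. Second, cursor.executemany still issues one statement round trip per row; if throughput matters, psycopg2.extras.execute_batch (psycopg2 >= 2.7) sends statements to the server in pages. A sketch of a batched variant of save_df_to_db, under the same connection settings:

import psycopg2
import psycopg2.extras

def save_df_to_db_batched(records):
    db_conn = psycopg2.connect(database="testdb", user="postgres", password="password", host="127.0.0.1", port="5432")
    cur = db_conn.cursor()
    upsert_query = ("INSERT INTO t3 (id, name, val) VALUES (%(id)s, %(name)s, %(val)s) "
                    "ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, val = EXCLUDED.val")
    try:
        # execute_batch groups rows into fewer round trips (default page_size=100)
        psycopg2.extras.execute_batch(cur, upsert_query, [r.asDict() for r in records])
        db_conn.commit()
    finally:
        db_conn.close()

Dropping it in is a one-line change: rdd2.foreachPartition(save_df_to_db_batched).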
