level 1
最爱银时男
楼主
用的是python连接spark
import sys
import os
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark import SQLContext
import pandas as pd
os.environ["SPARK_HOME"]='/opt/hadoopclient/Spark/spark/'
sys.path.append('/opt/hadoopclient/Spark/spark/python')
string_test='pyspark_test'
conf=SparkConf().setAppName(string_test).setMaster('local[*]')
sc=SparkContext(conf=conf)
sparksql=HiveContext(sc)
sparksql.sql("drop table if exists sfz616_result1")
sparksql.sql("drop table if exists sfz616_result2")
第一个表:
sparksql.sql("create table sfz616_result1 as \
select a.zjhm,a.fcrq,a.hcpcc,a.hcpcfd,a.hcpmdd from bigdata.gd_tl_gtsmzxx a,sfz_616 b where a.zjhm=b.sfz")
第二个表:
sparksql.sql("create table sfz616_result2 as select zjhm,max(fcrq) from sfz616_result1 group by zjhm")
sparksql.sql("create table sfz616_result3 as select distinct t1.* from sfz616_result1 t1,sfz616_result2 t2 where t1.zjhm=t2.zjhm and t1.fcrq=t2.fcrq2")
df=sparksql.sql("select * from sfz616_result2")
df1=df.toPandas()
df1.to_excel("./2.xlsx")
很简单的关联去重语句,建第一个表没问题,就是死活建不了第二个表
2019年10月19日 16点10分
1
import sys
import os
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark import SQLContext
import pandas as pd
os.environ["SPARK_HOME"]='/opt/hadoopclient/Spark/spark/'
sys.path.append('/opt/hadoopclient/Spark/spark/python')
string_test='pyspark_test'
conf=SparkConf().setAppName(string_test).setMaster('local[*]')
sc=SparkContext(conf=conf)
sparksql=HiveContext(sc)
sparksql.sql("drop table if exists sfz616_result1")
sparksql.sql("drop table if exists sfz616_result2")
第一个表:
sparksql.sql("create table sfz616_result1 as \
select a.zjhm,a.fcrq,a.hcpcc,a.hcpcfd,a.hcpmdd from bigdata.gd_tl_gtsmzxx a,sfz_616 b where a.zjhm=b.sfz")
第二个表:
sparksql.sql("create table sfz616_result2 as select zjhm,max(fcrq) from sfz616_result1 group by zjhm")
sparksql.sql("create table sfz616_result3 as select distinct t1.* from sfz616_result1 t1,sfz616_result2 t2 where t1.zjhm=t2.zjhm and t1.fcrq=t2.fcrq2")
df=sparksql.sql("select * from sfz616_result2")
df1=df.toPandas()
df1.to_excel("./2.xlsx")
很简单的关联去重语句,建第一个表没问题,就是死活建不了第二个表