Skip to content

Commit

Permalink
pyspark partitionby
Browse files Browse the repository at this point in the history
  • Loading branch information
sparkcodegeeks committed Mar 7, 2021
1 parent c00a13b commit 6aeb9e8
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
61 changes: 61 additions & 0 deletions pyspark-partitionby.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession

# Build the SparkSession used by every step below; getOrCreate() returns an
# already-active session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Load the zip-code dataset, treating the first line of the file as the header.
# Alternative (smaller) input kept for reference:
#   C:/apps/sparkbyexamples/src/pyspark-examples/resources/simple-zipcodes.csv
csv_reader = spark.read.option("header", True)
df = csv_reader.csv(
    "C:/apps/sparkbyexamples/src/spark-scala-examples-new/src/main/resources/free-zipcode-database.csv"
)

# Preview the rows, then report how many partitions the loaded data occupies.
df.show()
print(df.rdd.getNumPartitions())

# Write the same DataFrame twice: first partitioned by state only, then by
# state and city. Each partition column becomes one directory level
# (<column>=<value>) under the target folder.
for partition_cols, target_dir in (
    (("state",), "c:/tmp/zipcodes-state"),
    (("state", "city"), "c:/tmp/zipcodes-state-city"),
):
    (
        df.write.option("header", True)
        .partitionBy(*partition_cols)
        .mode("overwrite")
        .csv(target_dir)
    )


# Redistribute the rows across exactly two in-memory partitions and confirm
# the new partition count.
df = df.repartition(2)
print(df.rdd.getNumPartitions())

# Write by state again from the repartitioned data; every in-memory partition
# emits its own part file for each state it contains.
state_writer = df.write.option("header", True).partitionBy("state")
state_writer.mode("overwrite").csv("c:/tmp/zipcodes-state-more")

# Read the state-partitioned output back from its root folder. Partition
# discovery rebuilds the "state" column from the state=<value> directory names.
dfPartition = (
    spark.read.option("header", True)
    .csv("c:/tmp/zipcodes-state")
)
dfPartition.printSchema()

# Read one leaf partition directly. The state=<..>/city=<..> layout exists only
# in the output partitioned by both columns (zipcodes-state-city); the
# state-only output has no city=... subdirectories, so pointing the reader at
# it fails because the path does not exist.
dfSinglePart = (
    spark.read.option("header", True)
    .csv("c:/tmp/zipcodes-state-city/state=AL/city=SPRINGVILLE")
)
# Because the leaf directory itself is the load path (no basePath option),
# state and city are not added back as columns in this schema.
dfSinglePart.printSchema()
dfSinglePart.show()

# Register the state-partitioned CSV output as the temporary view ZIPCODE so
# it can be queried with Spark SQL, then show the rows for one city in one state.
zipcodes_view_df = spark.read.option("header", True).csv("c:/tmp/zipcodes-state")
zipcodes_view_df.createOrReplaceTempView("ZIPCODE")

springville_rows = spark.sql(
    "select * from ZIPCODE where state='AL' and city = 'SPRINGVILLE'"
)
springville_rows.show()

# Cap every output file at two records, so a state with more rows is split
# across several part files inside its state=<value> directory.
# The target sits under c:/tmp like every other output of this script; a
# drive-less "/tmp/..." path would resolve against whichever drive the job is
# started from on Windows.
(
    df.write.option("header", True)
    .option("maxRecordsPerFile", 2)
    .partitionBy("state")
    .mode("overwrite")
    .csv("c:/tmp/zipcodes-state-maxrecords")
)

21 changes: 21 additions & 0 deletions resources/simple-zipcodes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
RecordNumber,Country,City,Zipcode,State
1,US,PARC PARQUE,704,PR
2,US,PASEO COSTA DEL SUR,704,PR
10,US,BDA SAN LUIS,709,PR
61391,US,CINGULAR WIRELESS,76166,TX
61392,US,FORT WORTH,76177,TX
61393,US,FT WORTH,76177,TX
4,US,URB EUGENE RICE,704,PR
39827,US,MESA,85209,AZ
39828,US,MESA,85210,AZ
49345,US,HILLIARD,32046,FL
49346,US,HOLDER,34445,FL
49347,US,HOLT,32564,FL
49348,US,HOMOSASSA,34487,FL
3,US,SECT LANAUSSE,704,PR
54354,US,SPRING GARDEN,36275,AL
54355,US,SPRINGVILLE,35146,AL
54356,US,SPRUCE PINE,35585,AL
76511,US,ASH HILL,27007,NC
76512,US,ASHEBORO,27203,NC
76513,US,ASHEBORO,27204,NC

0 comments on commit 6aeb9e8

Please sign in to comment.