import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

Initialising a SparkSession
findspark.init()
spark = SparkSession.builder.appName("App01").getOrCreate()
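Put together, session start-up is just these lines; findspark is only needed when PySpark is installed but not already on the interpreter's path, so skip it in environments where a session is pre-configured:

import findspark
findspark.init()  # locate the local Spark installation and add it to sys.path

from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session rather than starting a second one
spark = SparkSession.builder.appName("App01").getOrCreate()
print(spark.version)  # quick sanity check that the session is live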
Reading data from a file
df = spark.read.format('csv').option('header', True).load('fpath.csv')

Loading/Creating DataFrame from CSV
df = spark.read.csv(fpath, header=True)

Loading/Creating DataFrame from JSON
df = spark.read.json(fpath)

Loading/Creating DataFrame from Parquet
df = spark.read.parquet(fpath)

Loading/Creating DataFrame from RDDs
df = rdd1.toDF(["ID", "Name"])

Creating DataFrame from data
data = [(1, "Alice", 28), (2, "Bob", 22)]
columns = ["ID", "Name", "Age"]
df = spark.createDataFrame(data, columns)

Getting DataFrame row count
df.count()

Getting DataFrame columns
df.columns

Getting DataFrame schema
df.printSchema()

Displaying data
df.show()

Selecting columns
df1 = df.select("Name", "Age")

Filtering rows using filter()
df1 = df.filter(df.Age > 18)

Filtering rows using expr()
df1 = df.filter(expr("Age > 18"))

Filtering rows using where()
df1 = df.where(df.Age > 18)

Filtering rows using SQL-like syntax
df1 = df.where("Age > 18")

Aggregating data
df_aggregated = df.groupBy('colname1').agg(avg('colname2'))

Joining data
df_joined = df1.join(df2, df1.column_name == df2.column_name)
df_joined = df1.join(df2, join_condition, "full")
df_joined = df1.join(df2, join_condition, "left")
df_joined = df1.join(df2, join_condition, "right")
df_joined = df1.join(df2, join_condition, "inner")

Adding a column
dfnew = df.withColumn("City", lit("Bangalore"))

Renaming a column
dfnew = df.withColumnRenamed("Name", "Full_Name")

Dropping a column
dfnew = df.drop("Age")

Transforming data using functions
df1 = df.withColumn("Status", when(col("Age") > 25, "Adult").otherwise("Young"))

Transforming data using expressions
df1 = df.withColumn("ID_Name", concat(col("ID"), lit("_"), col("Name")))

Transforming data using SQL-like expr
df1 = df.withColumn("AgeGroup", expr("CASE WHEN Age <= 25 THEN 'Young' ELSE 'Adult' END"))

Writing data to a file
df.write.format('csv').option('header', True).save('file_path.csv')
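These snippets compose end to end. Below is a small self-contained sketch chaining creation, filtering, joining, column transforms, and writing; the sample rows, the cities lookup table, and the out_path destination are invented for illustration:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, concat

spark = SparkSession.builder.appName("App01").getOrCreate()

# Hypothetical sample data
people = spark.createDataFrame(
    [(1, "Alice", 28), (2, "Bob", 22), (3, "Cara", 31)],
    ["ID", "Name", "Age"],
)
cities = spark.createDataFrame([(1, "Bangalore"), (3, "Pune")], ["ID", "City"])

# Filter, join on the shared ID column, then derive new columns
adults = people.filter(col("Age") > 25)
joined = adults.join(cities, on="ID", how="left")
result = (
    joined
    .withColumn("Status", when(col("Age") > 25, "Adult").otherwise("Young"))
    .withColumn("ID_Name", concat(col("ID").cast("string"), lit("_"), col("Name")))
)
result.show()

# Write out as CSV with a header row; out_path is a placeholder
result.write.mode("overwrite").format("csv").option("header", True).save("out_path")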
Aggregation using Sum, Avg, Count, Min, Max
df1 = df.select(avg("Salary")).collect()[0][0]
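The same select-and-collect pattern works for the other aggregates; here is a sketch on a hypothetical Salary column (note that importing sum, min, and max from pyspark.sql.functions shadows Python's built-ins of the same names):

from pyspark.sql.functions import sum, avg, count, min, max

# Each aggregate yields a one-row DataFrame; collect()[0][0] extracts the value
total   = df.select(sum("Salary")).collect()[0][0]
mean    = df.select(avg("Salary")).collect()[0][0]
n       = df.select(count("Salary")).collect()[0][0]
lowest  = df.select(min("Salary")).collect()[0][0]
highest = df.select(max("Salary")).collect()[0][0]

# Or compute them all in a single pass over the data
df.select(sum("Salary"), avg("Salary"), count("Salary"), min("Salary"), max("Salary")).show()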
Grouping data elements
df_aggregated = df.groupBy('colname1').agg(avg('colname2'))

Handling Missing Data

Dropping Rows with Null Values
df.dropna()

Filling Null Values
df.fillna({'Age': df.select(avg('Age')).first()[0]})

Handling Categorical Missing Data
df.fillna({'Gender': 'Unknown'})

Adding an Indicator Column
df.withColumn('Age_missing', df['Age'].isNull())
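Put together on a small invented DataFrame with nulls in Age and Gender (flagging before filling preserves the record of which values were imputed):

from pyspark.sql.functions import avg, col

df = spark.createDataFrame(
    [(1, "Alice", 28.0, "F"), (2, "Bob", None, None), (3, "Cara", 31.0, "F")],
    ["ID", "Name", "Age", "Gender"],
)

# Flag missing ages before imputing, so the information is not lost
df = df.withColumn("Age_missing", col("Age").isNull())

# Impute Age with the column mean and Gender with a sentinel category
mean_age = df.select(avg("Age")).first()[0]
df = df.fillna({"Age": mean_age, "Gender": "Unknown"})

# Drop any rows that still contain nulls
df = df.dropna()
df.show()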
Date and Time Stamp
from pyspark.sql.functions import col, current_date, current_timestamp, datediff, months_between, date_add, date_sub
Current Date
df.withColumn("CurrentDate", current_date())

Current Time Stamp
df.withColumn("CurrentTimestamp", current_timestamp())

Date Difference
df.withColumn("DaysSince", datediff(current_date(), col("Date")))

Months Between
df.withColumn("MonthsBetween", months_between(current_date(), col("Date")))

Date Addition
df.withColumn("Plus10Days", date_add(col("Date"), 10))

Date Subtraction
df.withColumn("Minus5Days", date_sub(col("Date"), 5))
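A runnable sketch of the date helpers above, assuming a hypothetical Date column parsed from 'yyyy-MM-dd' strings with to_date:

from pyspark.sql.functions import (
    col, to_date, current_date, current_timestamp,
    datediff, months_between, date_add, date_sub,
)

df = spark.createDataFrame([("2024-01-15",), ("2024-06-01",)], ["DateStr"])
df = df.withColumn("Date", to_date(col("DateStr"), "yyyy-MM-dd"))

df = (
    df.withColumn("CurrentDate", current_date())
      .withColumn("CurrentTimestamp", current_timestamp())
      .withColumn("DaysSince", datediff(current_date(), col("Date")))
      .withColumn("MonthsBetween", months_between(current_date(), col("Date")))
      .withColumn("Plus10Days", date_add(col("Date"), 10))
      .withColumn("Minus5Days", date_sub(col("Date"), 5))
)
df.show(truncate=False)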