The following sample code (in Python and C#) shows how to read a CSV file without specifying a schema explicitly; the column types are inferred by Spark through the `inferSchema` option.
Data File
- Create a CSV file named `grocery-price.csv` with the following content.
product,price
Milk,3.99
Bread,4.5
Bread,4.25
Egg,2.99
Milk,4.3
Egg,3.49
Bread,4.15
Egg,3.75
Milk,3.89
Egg,3.15
Result
Python Application
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
if __name__ == "__main__":
    # Entry point: load the grocery price CSV, print it, then print the
    # average price per product.
    spark = (SparkSession
             .builder
             .appName("grocery-price")
             .getOrCreate())

    try:
        data_file = "test/data/grocery-price.csv"

        # No explicit schema is supplied: the "header" option is enabled so
        # the first row is used for column names, and "inferSchema" asks
        # Spark to work out the column types from the data.
        grocery_price = (spark.read.format("csv")
                         .option("header", True)
                         .option("inferSchema", True)
                         .load(data_file))
        grocery_price.show()

        # group by product and get the average price
        average_price = (grocery_price
                         .groupBy("product")
                         .agg(avg("price")))
        average_price.show()
    finally:
        # Stop the session even when loading or aggregating fails, so a
        # failed run does not leave the Spark session running.
        spark.stop()
C# Application
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace MySpark.Examples
{
    /// <summary>
    /// Sample that reads the grocery price CSV file without an explicit schema
    /// and prints the average price per product.
    /// </summary>
    internal class GroceryPrice
    {
        /// <summary>
        /// Loads <c>data/grocery-price.csv</c> with the <c>header</c> and
        /// <c>inferSchema</c> options enabled, shows the loaded rows, then shows
        /// the average of the <c>price</c> column grouped by <c>product</c>.
        /// The Spark session is stopped when the method exits, including when
        /// an exception is thrown.
        /// </summary>
        public static void Run()
        {
            SparkSession spark =
                SparkSession
                    .Builder()
                    .AppName("grocery-price")
                    .GetOrCreate();

            try
            {
                string filePath = "data/grocery-price.csv";

                // Initial data frame: no schema is passed, so column names come
                // from the header row and column types are inferred by Spark.
                DataFrame groceryPrice = spark.Read()
                    .Format("csv")
                    .Option("header", true)
                    .Option("inferSchema", true)
                    .Load(filePath);
                groceryPrice.Show();

                // Group by product and get the average price.
                DataFrame averagePrice =
                    groceryPrice.GroupBy("product")
                        .Agg(Avg("price"));
                averagePrice.Show();
            }
            finally
            {
                // Stop the session even if loading or aggregation failed, so a
                // failed run does not leave the Spark session running.
                spark.Stop();
            }
        }
    }
}