The following sample code (in Python and C#) shows how to count the occurrences of each word in a text file.
Data File
- Create a text file with any content (the samples below read it as `genesis.txt`), for example:
Genesis 1
The Beginning
In the beginning God created the heavens and the earth.
Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters.
And God said, Let there be light, and there was light.
God saw that the light was good, and he separated the light from the darkness.
God called the light day, and the darkness he called night.
And there was evening, and there was morning—the first day.
Result
Python Application
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
if __name__ == "__main__":
    # Word-count walkthrough: read a text file, split every line into words,
    # count how often each word occurs and print the most frequent ones.

    # create a Spark Session (reuses an already-running one if present)
    spark = (SparkSession
             .builder
             .appName("word_count")
             .getOrCreate())

    # Everything after session creation runs inside try/finally so that the
    # session is stopped even when a step fails (e.g. the data file is missing).
    try:
        # read the data file - row by row; each line lands in a "value" column
        data_file = "test/data/genesis.txt"
        data_frame = spark.read.text(data_file)
        data_frame.show(n=5)

        # split into words -- space separated
        # NOTE: the separator is a regular expression matching ONE space, so
        # empty lines / consecutive spaces yield empty-string "words" and
        # punctuation stays attached to the word it follows.
        words = data_frame.select(split(col("value"), " ").alias("words"))
        words.show(n=5)

        # expand the array of words -- one output row per array element
        word_list = words.select(explode(col("words")).alias("word"))
        word_list.show(n=10)

        # group by word, and count the occurrences (adds a "count" column)
        word_count = word_list.groupBy("word").count()
        word_count.show(n=10)

        # sort by count, most frequent first
        sorted_word_count = word_count.orderBy(col("count").desc())
        sorted_word_count.show(n=10)

        # create word counts in a single statement -- same steps as above,
        # chained without intermediate variables
        result = (data_frame
                  .select(split(col("value"), " ").alias("words"))
                  .select(explode(col("words")).alias("word"))
                  .groupBy("word")
                  .count()
                  .orderBy(col("count").desc()))
        result.show(n=10)
    finally:
        # always release the Spark session, on success and on failure
        spark.stop()
C# Application
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace MySpark.Examples
{
    /// <summary>
    /// Word-count example for .NET for Apache Spark: reads a text file,
    /// splits each line into words and prints how often each word occurs.
    /// </summary>
    internal class WordCount
    {
        /// <summary>
        /// Runs the word-count walkthrough against <c>data/genesis.txt</c>,
        /// printing each intermediate DataFrame and finally the words ordered
        /// by descending count. The Spark session is stopped before the
        /// method returns, including when one of the steps throws.
        /// </summary>
        public static void Run()
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                    .Builder()
                    .AppName("word_count")
                    .GetOrCreate();

            // The pipeline runs inside try/finally so the session is stopped
            // even when a step fails (e.g. the data file cannot be read).
            try
            {
                // Create initial DataFrame -- one row per line, in column "value"
                string dataFile = "data/genesis.txt";
                DataFrame dataFrame = spark.Read().Text(dataFile);
                dataFrame.Show(5);

                // Split into words -- space separated.
                // NOTE: the pattern matches ONE space, so empty lines or
                // consecutive spaces yield empty-string "words" and
                // punctuation stays attached to the preceding word.
                DataFrame words = dataFrame.Select(Split(Col("value"), " ").Alias("words"));
                words.Show(5);

                // Expand the array of words -- one output row per element
                DataFrame wordList = words.Select(Explode(Col("words")).Alias("word"));
                wordList.Show(10);

                // Group by word, and count the occurrences (adds a "count" column)
                DataFrame wordCount = wordList.GroupBy("word").Count();
                wordCount.Show(10);

                // Sort by count, most frequent first
                DataFrame sortedWordCount = wordCount.OrderBy(Col("count").Desc());
                sortedWordCount.Show(10);

                // Create word counts in a single statement -- same steps as
                // above, chained without intermediate variables
                DataFrame result =
                    dataFrame
                        .Select(Split(Col("value"), " ").Alias("words"))
                        .Select(Explode(Col("words")).Alias("word"))
                        .GroupBy("word")
                        .Count()
                        .OrderBy(Col("count").Desc());
                result.Show(10);
            }
            finally
            {
                // Stop Spark session, on success and on failure
                spark.Stop();
            }
        }
    }
}