{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# 引入pyspark,并创建spark上下文\n",
"import findspark\n",
"findspark.init()\n",
"import pyspark\n",
"sc = pyspark.SparkContext()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 创建RDD的第一种方式,读外部数据,比如本地磁盘文件"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"rdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pyspark.rdd.RDD"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 查看rdd类型\n",
"type(rdd)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 RDD之转换(Transformation)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
"Wall time: 31 µs\n"
]
}
],
"source": [
"%%time\n",
"## map是转换操作的一种,这时候只是形成DAG\n",
"rdd = rdd.map(lambda x: len(x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 RDD之行动(Action)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13187\n",
"CPU times: user 12 ms, sys: 0 ns, total: 12 ms\n",
"Wall time: 1.58 s\n"
]
}
],
"source": [
"%%time\n",
"## reduce是行动操作的一种, 这个时候才真正的计算\n",
"charCount = rdd.reduce(lambda x, y: x+y)\n",
"\n",
"print(charCount)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 328 2260 13687 ./dataset/Goodbye_Object_Oriented_Programming.txt\r\n"
]
}
],
"source": [
"! wc ./dataset/Goodbye_Object_Oriented_Programming.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3 示例:统计单词出现的次数"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['I’ve been programming in Object Oriented languages for decades. The first OO language I used was C++ and then Smalltalk and finally .NET and Java.',\n",
" '']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordRdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')\n",
"\n",
"# take操作就是一种Action, 返回前n数据\n",
"wordRdd.take(2) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# 将每一行文本打散\n",
"wordRdd = wordRdd.map(lambda line: line.split(' '))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['I’ve',\n",
" 'been',\n",
" 'programming',\n",
" 'in',\n",
" 'Object',\n",
" 'Oriented',\n",
" 'languages',\n",
" 'for',\n",
" 'decades.',\n",
" 'The',\n",
" 'first',\n",
" 'OO',\n",
" 'language',\n",
" 'I',\n",
" 'used',\n",
" 'was',\n",
" 'C++',\n",
" 'and',\n",
" 'then',\n",
" 'Smalltalk',\n",
" 'and',\n",
" 'finally',\n",
" '.NET',\n",
" 'and',\n",
" 'Java.'],\n",
" ['']]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordRdd.take(2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2493"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 扁平化处理\n",
"\n",
"wordRdd = wordRdd.flatMap(lambda x: x)\n",
"\n",
"# 查看有多少个单词\n",
"wordRdd.count()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['I’ve', 'been']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 查看前两条数据\n",
"wordRdd.take(2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2260"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 过滤掉空格数据\n",
"wordRdd = wordRdd.filter(lambda x: x != '')\n",
"\n",
"# 查看有多少个单词\n",
"wordRdd.count()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('I’ve', 1), ('been', 1)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 转换成key-value形式rdd 即 (key, value)\n",
"wordRdd = wordRdd.map(lambda word: (word, 1))\n",
"\n",
"wordRdd.take(2)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('face', 1),\n",
" ('was', 18),\n",
" ('Monkey', 2),\n",
" ('how', 4),\n",
" ('Just', 1),\n",
" ('for', 11),\n",
" ('Directories', 1),\n",
" ('could', 4),\n",
" ('gained', 1),\n",
" ('AGAIN', 1)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wordRdd = wordRdd.reduceByKey(lambda x, y: x+y)\n",
"\n",
"# 查看一下\n",
"wordRdd.take(10)\n",
"\n",
"# 查看全部\n",
"# wordRdd.collect()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# 使用pandas继续计算\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" face | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" was | \n",
" 18 | \n",
"
\n",
" \n",
" 2 | \n",
" Monkey | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" how | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" Just | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" for | \n",
" 11 | \n",
"
\n",
" \n",
" 6 | \n",
" Directories | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" could | \n",
" 4 | \n",
"
\n",
" \n",
" 8 | \n",
" gained | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" AGAIN | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" word count\n",
"0 face 1\n",
"1 was 18\n",
"2 Monkey 2\n",
"3 how 4\n",
"4 Just 1\n",
"5 for 11\n",
"6 Directories 1\n",
"7 could 4\n",
"8 gained 1\n",
"9 AGAIN 1"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(wordRdd.collect())\n",
"\n",
"# 设置栏位名字\n",
"df.columns = ['word', 'count']\n",
"\n",
"\n",
"# 查看前10条数据\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" 263 | \n",
" the | \n",
" 121 | \n",
"
\n",
" \n",
" 271 | \n",
" to | \n",
" 57 | \n",
"
\n",
" \n",
" 576 | \n",
" of | \n",
" 47 | \n",
"
\n",
" \n",
" 358 | \n",
" and | \n",
" 45 | \n",
"
\n",
" \n",
" 589 | \n",
" a | \n",
" 41 | \n",
"
\n",
" \n",
" 797 | \n",
" is | \n",
" 38 | \n",
"
\n",
" \n",
" 136 | \n",
" in | \n",
" 35 | \n",
"
\n",
" \n",
" 593 | \n",
" I | \n",
" 32 | \n",
"
\n",
" \n",
" 685 | \n",
" that | \n",
" 29 | \n",
"
\n",
" \n",
" 645 | \n",
" The | \n",
" 26 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" word count\n",
"263 the 121\n",
"271 to 57\n",
"576 of 47\n",
"358 and 45\n",
"589 a 41\n",
"797 is 38\n",
"136 in 35\n",
"593 I 32\n",
"685 that 29\n",
"645 The 26"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 查看出现次数最多的十个单词\n",
"df =df.sort_values(by='count', ascending=False)\n",
"\n",
"\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# 停止spark上下文\n",
"sc.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}