{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Locate the local Spark installation, then create the Spark context.\n", "import findspark\n", "findspark.init()\n", "import pyspark\n", "\n", "# getOrCreate() reuses a context that is already running, so executing this\n", "# cell a second time does not fail with a multiple-SparkContexts error.\n", "sc = pyspark.SparkContext.getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. 创建RDD的第一种方式,读外部数据,比如本地磁盘文件" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Build an RDD from a text file on local disk.\n", "rdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pyspark.rdd.RDD" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect the type of the RDD\n", "type(rdd)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 RDD之转换(Transformation)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", "Wall time: 31 µs\n" ] } ], "source": [ "%%time\n", "# map is a transformation: it only extends the DAG, nothing is computed yet.\n", "# The mapped RDD gets its own name instead of rebinding `rdd`, so running this\n", "# cell again cannot stack a second len() on top of integers.\n", "lineLengthRdd = rdd.map(len)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 RDD之行动(Action)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13187\n", "CPU times: user 12 ms, sys: 0 ns, total: 12 ms\n", "Wall time: 1.58 s\n" ] } ], "source": [ "%%time\n", "# reduce is an action: this is the point where the computation really runs.\n", "charCount = lineLengthRdd.reduce(lambda x, y: x + y)\n", "\n", "print(charCount)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 328 2260 13687 ./dataset/Goodbye_Object_Oriented_Programming.txt\r\n" ] } ], "source": [ "! 
wc ./dataset/Goodbye_Object_Oriented_Programming.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3 示例:统计单词出现的次数" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['I’ve been programming in Object Oriented languages for decades. The first OO language I used was C++ and then Smalltalk and finally .NET and Java.',\n", " '']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Every stage of the word count below gets its own name, so each cell can be\n", "# re-run on its own without corrupting the input of the next one.\n", "lineRdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')\n", "\n", "# take is an action: it returns the first n elements\n", "lineRdd.take(2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Split every line of text into a list of tokens\n", "tokensPerLineRdd = lineRdd.map(lambda line: line.split(' '))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['I’ve',\n", " 'been',\n", " 'programming',\n", " 'in',\n", " 'Object',\n", " 'Oriented',\n", " 'languages',\n", " 'for',\n", " 'decades.',\n", " 'The',\n", " 'first',\n", " 'OO',\n", " 'language',\n", " 'I',\n", " 'used',\n", " 'was',\n", " 'C++',\n", " 'and',\n", " 'then',\n", " 'Smalltalk',\n", " 'and',\n", " 'finally',\n", " '.NET',\n", " 'and',\n", " 'Java.'],\n", " ['']]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokensPerLineRdd.take(2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2493" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Flatten the per-line token lists into one RDD of tokens\n", "tokenRdd = tokensPerLineRdd.flatMap(lambda tokens: tokens)\n", "\n", "# Number of tokens (empty strings are still included at this point)\n", "tokenRdd.count()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['I’ve', 'been']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the first two elements\n", "tokenRdd.take(2)" ] }, { "cell_type": "code", 
"execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2260" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop the empty strings that split(' ') yields for blank lines and repeated spaces\n", "nonEmptyTokenRdd = tokenRdd.filter(lambda token: token != '')\n", "\n", "# Number of words that remain\n", "nonEmptyTokenRdd.count()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('I’ve', 1), ('been', 1)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn every word into a key-value pair (word, 1)\n", "pairRdd = nonEmptyTokenRdd.map(lambda word: (word, 1))\n", "\n", "pairRdd.take(2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('face', 1),\n", " ('was', 18),\n", " ('Monkey', 2),\n", " ('how', 4),\n", " ('Just', 1),\n", " ('for', 11),\n", " ('Directories', 1),\n", " ('could', 4),\n", " ('gained', 1),\n", " ('AGAIN', 1)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sum the 1s of every word to get its number of occurrences\n", "wordRdd = pairRdd.reduceByKey(lambda x, y: x + y)\n", "\n", "# Peek at ten (word, count) pairs; wordRdd.collect() would return all of them\n", "wordRdd.take(10)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Continue the analysis with pandas\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
0face1
1was18
2Monkey2
3how4
4Just1
5for11
6Directories1
7could4
8gained1
9AGAIN1
\n", "
" ], "text/plain": [ " word count\n", "0 face 1\n", "1 was 18\n", "2 Monkey 2\n", "3 how 4\n", "4 Just 1\n", "5 for 11\n", "6 Directories 1\n", "7 could 4\n", "8 gained 1\n", "9 AGAIN 1" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# collect() is an action that returns every (word, count) pair as a Python list.\n", "# Naming the columns in the constructor also works when that list is empty,\n", "# whereas assigning df.columns afterwards raises a length-mismatch error.\n", "df = pd.DataFrame(wordRdd.collect(), columns=['word', 'count'])\n", "\n", "# Show the first 10 rows\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
263the121
271to57
576of47
358and45
589a41
797is38
136in35
593I32
685that29
645The26
\n", "
" ], "text/plain": [ " word count\n", "263 the 121\n", "271 to 57\n", "576 of 47\n", "358 and 45\n", "589 a 41\n", "797 is 38\n", "136 in 35\n", "593 I 32\n", "685 that 29\n", "645 The 26" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The ten most frequent words. The sorted frame gets its own name so that `df`\n", "# still holds the unsorted rows displayed by the previous cell.\n", "dfByCount = df.sort_values(by='count', ascending=False)\n", "\n", "dfByCount.head(10)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Stop the Spark context\n", "sc.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }