From 38ba8904e67ebc1188b60bdc0add1d10ce66c820 Mon Sep 17 00:00:00 2001 From: tink Date: Fri, 7 Jun 2024 16:05:44 +0800 Subject: [PATCH] add jupyter --- Makefile | 7 + .../Go-Frameworks-Github-Fork-Stats.ipynb | 232 + docs/jupyter/Pandas完全指南.ipynb | 8346 +++++++++++++++++ .../Spark上手示例1:RDD操作.ipynb | 610 ++ ...Spark上手示例2:DataFrame操作.ipynb | 1028 ++ docs/language/Go.md | 164 + mkdocs.yml | 17 +- 7 files changed, 10401 insertions(+), 3 deletions(-) create mode 100644 docs/jupyter/Go-Frameworks-Github-Fork-Stats.ipynb create mode 100644 docs/jupyter/Pandas完全指南.ipynb create mode 100644 docs/jupyter/Spark上手示例1:RDD操作.ipynb create mode 100644 docs/jupyter/Spark上手示例2:DataFrame操作.ipynb diff --git a/Makefile b/Makefile index 2dd6abb..ab9b3a2 100644 --- a/Makefile +++ b/Makefile @@ -13,3 +13,10 @@ html: publish: ssh root@www.cyub.vip "cd ${HTML_OUTPUT}; git pull" +plugin: + pip install mkdocs-git-revision-date-localized-plugin # 显示文档编辑时间 + pip install mkdocs-mermaid2-plugin # mermaid图表支持 + pip install mkdocs-charts-plugin # image box + pip install mkdocs-print-site-plugin # print site + pip install mkdocs-jupyter # jupyter + diff --git a/docs/jupyter/Go-Frameworks-Github-Fork-Stats.ipynb b/docs/jupyter/Go-Frameworks-Github-Fork-Stats.ipynb new file mode 100644 index 0000000..b15f6d9 --- /dev/null +++ b/docs/jupyter/Go-Frameworks-Github-Fork-Stats.ipynb @@ -0,0 +1,232 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: pip is being invoked by an old script wrapper. 
This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n", + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: pandas in /home/deploy/.local/lib/python3.6/site-packages (1.0.0)\n", + "Requirement already satisfied: matplotlib in /home/deploy/.local/lib/python3.6/site-packages (3.1.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in /home/deploy/.local/lib/python3.6/site-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /home/deploy/.local/lib/python3.6/site-packages (from pandas) (2019.3)\n", + "Requirement already satisfied: numpy>=1.13.3 in /home/deploy/.local/lib/python3.6/site-packages (from pandas) (1.18.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/deploy/.local/lib/python3.6/site-packages (from matplotlib) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /home/deploy/.local/lib/python3.6/site-packages (from matplotlib) (0.10.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /home/deploy/.local/lib/python3.6/site-packages (from matplotlib) (2.4.6)\n", + "Requirement already satisfied: six>=1.5 in /home/deploy/.local/lib/python3.6/site-packages (from python-dateutil>=2.6.1->pandas) (1.14.0)\n", + "Requirement already satisfied: setuptools in /home/deploy/.local/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib) (45.1.0)\n" + ] + } + ], + "source": [ + "!pip install pandas matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
forksstarswatchsopenIssues
Gin4074354551212242
Beego4688232431268813
Iris1942175076835
Revel13571157555887
Echo15081650055146
Buffalo430537217170
\n", + "
" + ], + "text/plain": [ + " forks stars watchs openIssues\n", + "Gin 4074 35455 1212 242\n", + "Beego 4688 23243 1268 813\n", + "Iris 1942 17507 683 5\n", + "Revel 1357 11575 558 87\n", + "Echo 1508 16500 551 46\n", + "Buffalo 430 5372 171 70" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 统计go框架fork次数信息\n", + "\n", + "frameworks = {\n", + " \"Gin\":\"gin-gonic/gin\",\n", + " \"Beego\": \"astaxie/beego\",\n", + " \"Iris\": \"kataras/iris\",\n", + " \"Revel\": \"revel/revel\",\n", + " \"Echo\": \"labstack/echo\",\n", + " \"Buffalo\": \"gobuffalo/buffalo\"\n", + "}\n", + "\n", + "\n", + "stats = {}\n", + "for name in frameworks.keys():\n", + " url = \"https://api.github.com/repos/\" + frameworks[name]\n", + " stats[name] = requests.get(url=url).json() # 获取仓库统计信息\n", + "\n", + "indexs = []\n", + "forks = []\n", + "stars = []\n", + "watchs = []\n", + "openIssues = []\n", + "\n", + "for name in stats:\n", + " indexs += [name]\n", + " forks += [stats[name]['forks_count']] # fork次数\n", + " stars += [stats[name]['watchers_count']] # star次数\n", + " watchs += [stats[name]['subscribers_count']] # watch次数\n", + " openIssues += [stats[name]['open_issues_count']] # open_issue次数\n", + "\n", + "df = pd.DataFrame({\n", + " 'forks':forks,\n", + " 'stars':stars,\n", + " 'watchs':watchs,\n", + " 'openIssues': openIssues\n", + "}, index = indexs)\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA3kAAAHrCAYAAABywVS0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3dfbhedXkn+u/NO/IOiYwQpjAt8v6SEBAbQ3k5YiieSqm2dECgqEwVS22nDDjjGZAWa49ombSKg4riVEXEUlFQjBoLzBEl0MhbcIyAJUAxBQwIxRL4nT/2It2GHZKdvcOz98rnc13Ptde611q/537iI9nfrLV+q1prAQAAoB82GHQDAAAAjB8hDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOiRjQbdwNqaMmVK23XXXQfdBgAAwEDccsst/9xam7pyfdKGvF133TULFiwYdBsAAAADUVU/Hqnuck0AAIAeEfIAAAB6RMgDAADokUl7Tx4AADA4zzzzTJYsWZKnn3560K303mabbZZp06Zl4403XqP9hTwAAGDUlixZkq222iq77rprqmrQ7fRWay2PPPJIlixZkt12222NjnG5JgAAMGpPP/10dthhBwFvHauq7LDDDqM6YyrkAQAAa0XAe2mM9s9ZyAMAAOgR9+QBAABjtus514zrePe9/9jV7jN37txcfPHFmTFjRj7zmc+sdv9PfepTWbBgQf76r/96PFqcsIQ8AABgUvrIRz6Sb3zjG5k2bdpq912+fPlL0NHE4HJNAABg0vn93//93HPPPTnmmGPywQ9+MMcdd1z233//HHroobntttuSJOedd17e/OY3Z9asWXnzm9/8C8dfc801efWrX51//ud/zhe+8IXsu+++OeCAA3LYYYcN4uOMKyEPAACYdD760Y9mp512yvz583Pfffdl+vTpue222/K+970vJ5988or97rrrrnzjG9/I5z73uRW1q666Ku9///tz7bXXZsqUKTn//PNz3XXX5fvf/36uvvrqQXycceVyTQAAYFK78cYb88UvfjFJcuSRR+aRRx7J448/niT5jd/4jWy++eYr9v3Wt76VBQsW5Otf/3q23nrrJMmsWbNy6qmn5rd/+7dz/PHHv/QfYJw5kwcAAPTWFlts8Qvrv/zLv5wnnngi/+f//J8VtY9+9KP5sz/7s9x///056KCD8sgjj7zUbY4rIQ8AAJjUZs+evWJ2zW9/+9uZMmXKirN0K/ulX/qlfPGLX8zJJ5+cO++8M0nyox/9KK961aty/vnnZ+rUqbn//vtfst7XBZdrAgAAY7YmjzxYV84777ycdtpp2X///fOyl70sl1122Yvuv+eee+Yzn/lM3vSmN+XLX/5yzjrrrPzwhz9May1HHXVUDjjggJeo83WjWmuD7mGtzJw5sy1YsGDQbQAAwHpp0aJF2WuvvQbdxnpjpD/vqrqltTZz5X1drgkAANAjQh4AAECPuCdvojhvm3EYY9nYxwAAACY1Z/IAAAB6ZLUhr6o2q6rvVdX3q+rOqnpvV/9UVd1bVQu714FdvapqblUtrqrbqmrGsLFOqaofdq9ThtUPqqrbu2PmVlWtiw8LAADQd2tyuebPkxzZWvtZVW2c5Maq+mq37azW2pUr7X9Mkt2716uSXJzkVVW1fZJzk8xM0pLcUlVXt9Ye6/Z5W5LvJrk2yZwkXw0AAACjstqQ14aesfCzbnXj7vViz114Q5JPd8fdVFXbVtUrkhyeZF5r7dEkqap5SeZU1beTbN1au6mrfzrJcRHyAABg8hiPOSZ+YbzRzzdx0UUX5fTTT8/LXvay8e1lklmje/KqasOqWpjkJxkKat/tNl3QXZL5l1W1aVfbOcnwR8Qv6WovVl8yQn2kPk6vqgVVtWDp0qVr0joAALCeuOiii/LUU0+N6phnn312HXUzOGsU8lprz7bWDkwyLckhVbVvkncn2TPJwUm2T3L2Ouvy3/q4pLU
2s7U2c+rUqev67QAAgAnqySefzLHHHpsDDjgg++67b9773vfmwQcfzBFHHJEjjjgiSfL2t789M2fOzD777JNzzz13xbG77rprzj777MyYMSNf+MIXMnfu3Oy9997Zf//9c8IJJwzqI42bUT1CobX206qan2ROa+3Crvzzqvpkkj/p1h9Issuww6Z1tQcydMnm8Pq3u/q0EfYHAAAY0de+9rXstNNOueaaa5Iky5Ytyyc/+cnMnz8/U6ZMSZJccMEF2X777fPss8/mqKOOym233Zb9998/SbLDDjvk1ltvTZLstNNOuffee7Ppppvmpz/96WA+0Dhak9k1p1bVtt3y5klem+Tu7j67dDNhHpfkju6Qq5Oc3M2yeWiSZa21h5Jcl+ToqtquqrZLcnSS67ptj1fVod1YJyf50vh+TAAAoE/222+/zJs3L2effXZuuOGGbLPNC+8JvOKKKzJjxoxMnz49d955Z+66664V237nd35nxfL++++fE088MX/zN3+TjTaa/I8SX5NP8Iokl1XVhhkKhVe01r5SVd+qqqlJKsnCJL/f7X9tkl9PsjjJU0l+L0laa49W1Z8mubnb7/znJ2FJ8o4kn0qyeYYmXDHpCgAAsEqvfOUrc+utt+baa6/Ne97znhx11FG/sP3ee+/NhRdemJtvvjnbbbddTj311Dz99NMrtm+xxRYrlq+55ppcf/31+fKXv5wLLrggt99++6QOe2syu+ZtSaaPUD9yFfu3JGesYtulSS4dob4gyb6r6wUAACBJHnzwwWy//fY56aSTsu222+bjH/94ttpqqzzxxBOZMmVKHn/88WyxxRbZZptt8vDDD+erX/1qDj/88BeM89xzz+X+++/PEUcckde85jW5/PLL87Of/SzbbrvtS/+hxsnkjacAAMDEsRaPPBiL22+/PWeddVY22GCDbLzxxrn44ovzne98J3PmzMlOO+2U+fPnZ/r06dlzzz2zyy67ZNasWSOO8+yzz+akk07KsmXL0lrLmWeeOakDXpLU0Im3yWfmzJltwYIFg25j/IzHc0Ve4v9jAQCw/lq0aFH22muvQbex3hjpz7uqbmmtzVx53zV6hAIAAACTg5AHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0COekwcAAIzZfpftN67j3X7K7eM63vMuuuiinH766XnZy162yn3OO++8bLnllvmTP/mTddLDuuZMHgAAsN646KKL8tRTTw26jXVKyAMAACadD3zgA5k7d26S5I/+6I9y5JFHJkm+9a1v5cQTT8zb3/72zJw5M/vss0/OPffcJMncuXPz4IMP5ogjjsgRRxyRJPna176WGTNm5IADDshRRx21Yvy77rorhx9+eP7Df/gPK97nySefzLHHHpsDDjgg++67bz7/+c+/lB95jblcEwAAmHRmz56dD37wgznzzDOzYMGC/PznP88zzzyTG264IYcddlje9KY3Zfvtt8+zzz6bo446KrfddlvOPPPMfOhDH8r8+fMzZcqULF26NG9729ty/fXXZ7fddsujjz66Yvy777478+fPzxNPPJE99tgjb3/72/O1r30tO+20U6655pokybJlywb18V+UM3kAAMCkc9BBB+WWW27J448/nk033TSvfvWrs2DBgtxwww2ZPXt2rrjiisyYMSPTp0/PnXfembvuuusFY9x000057LDDsttuuyVJtt9++xXbjj322Gy66aaZMmVKXv7yl+fhhx/Ofvvtl3nz5uXss8/ODTfckG222eYl+7yjIeQBAACTzsYbb5zddtstn/rUp/Krv/qrmT17dubPn5/Fixdn8803z4UXXphvfvObue2223Lsscfm6aefHtX4m2666YrlDTfcMMuXL88rX/nK3Hrrrdlvv/3ynve8J+eff/54f6xxIeQBAACT0uzZs3PhhRfmsMMOy+zZs/PRj34006dPz+OPP54tttgi22yzTR5++OF89atfXXHMVlttlSeeeCJJcui
hh+b666/PvffemyS/cLnmSB588MG87GUvy0knnZSzzjort95667r7cGPgnjwAAGDM1tUjD17M7Nmzc8EFF+TVr351tthii2y22WaZPXt2DjjggEyfPj177rlndtlll8yaNWvFMaeffnrmzJmTnXbaKfPnz88ll1yS448/Ps8991xe/vKXZ968eat8v9tvvz1nnXVWNthgg2y88ca5+OKLX4qPOWrVWht0D2tl5syZbcGCBYNuY/ycNw7X8543MW/8BACgfxYtWpS99tpr0G2sN0b6866qW1prM1fe1+WaAAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSI5+QBAABjtmjP8X2cwl53LxrX8Ubr29/+di688MJ85StfGWgfa8OZPAAAgB4R8gAAgEnpQx/6UPbdd9/su+++ueiii3Lfffdlzz33zIknnpi99torb3zjG/PUU08lSW655Zb82q/9Wg466KC87nWvy0MPPZQkOfzww3P22WfnkEMOyStf+crccMMNL3ifv//7v8+BBx6YAw88MNOnT88TTzyRhx56KIcddlgOPPDA7LvvviuO23LLLVccd+WVV+bUU09NkixdujS/9Vu/lYMPPjgHH3xw/vf//t+rHHushDwAAGDSueWWW/LJT34y3/3ud3PTTTflYx/7WB577LH84Ac/yDve8Y4sWrQoW2+9dT7ykY/kmWeeyR/8wR/kyiuvzC233JLTTjst/+2//bcVYy1fvjzf+973ctFFF+W9733vC97rwgsvzIc//OEsXLgwN9xwQzbffPN89rOfzete97osXLgw3//+93PggQe+aL9/+Id/mD/6oz/KzTffnC9+8Yt561vfusqxx8o9eQAAwKRz44035jd/8zezxRZbJEmOP/743HDDDdlll10ya9asJMlJJ52UuXPnZs6cObnjjjvy2te+Nkny7LPP5hWveMWKsY4//vgkyUEHHZT77rvvBe81a9as/PEf/3FOPPHEHH/88Zk2bVoOPvjgnHbaaXnmmWdy3HHHrTbkfeMb38hdd921Yv3xxx/Pz372sxHHHitn8gAAgN6oqhest9ayzz77ZOHChVm4cGFuv/32fP3rX1+xz6abbpok2XDDDbN8+fIXjHnOOefk4x//eP7lX/4ls2bNyt13353DDjss119/fXbeeeeceuqp+fSnP/2C93/66adXLD/33HO56aabVvTwwAMPZMsttxxx7LES8gAAgEln9uzZ+bu/+7s89dRTefLJJ3PVVVdl9uzZ+cd//Md85zvfSZJ89rOfzWte85rsscceWbp06Yr6M888kzvvvHON3+tHP/pR9ttvv5x99tk5+OCDc/fdd+fHP/5xdtxxx7ztbW/LW9/61tx6661Jkh133DGLFi3Kc889l6uuumrFGEcffXT+6q/+asX6woULVzn2WLlcEwAAGLOX+pEHM2bMyKmnnppDDjkkSfLWt7412223XfbYY498+MMfzmmnnZa99947b3/727PJJpvkyiuvzJlnnplly5Zl+fLlede73pV99tlnjd7roosuyvz587PBBhtkn332yTHHHJPLL788H/jAB7Lxxhtnyy23XHEm7/3vf39e//rXZ+rUqZk5c2Z+9rOfJUnmzp2bM844I/vvv3+WL1+eww47LB/96EdHHHusqrU25kEGYebMmW3BggWDbmP8nLfNOIyxbOxjAADAGli0aFH22mt8n403Vvfdd19e//rX54477hh0K+NupD/vqrqltTZz5X1drgkAANAjQh4AANALu+66ay/P4o2WkAcAAKyVyXrr12Qz2j9nIQ8AABi1zTbbLI888oigt4611vLII49ks802W+NjzK4JAACM2rRp07JkyZIsXbp00K303mabbTaqh6QLeQAAwKhtvPHG2W233QbdBiNwuSYAAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQA
AAD0i5AEAAPSIkAcAANAjqw15VbVZVX2vqr5fVXdW1Xu7+m5V9d2qWlxVn6+qTbr6pt364m77rsPGendX/0FVvW5YfU5XW1xV54z/xwQAAFg/rMmZvJ8nObK1dkCSA5PMqapDk/xFkr9srf1KkseSvKXb/y1JHuvqf9ntl6raO8kJSfZJMifJR6pqw6raMMmHkxyTZO8kv9vtCwAAwCitNuS1IT/rVjfuXi3JkUmu7OqXJTmuW35Dt55u+1FVVV398tbaz1tr9yZZnOSQ7rW4tXZPa+1fk1ze7QsAAMAordE9ed0Zt4VJfpJkXpIfJflpa215t8uSJDt3yzsnuT9Juu3LkuwwvL7SMauqj9TH6VW1oKoWLF26dE1aBwAAWK+sUchrrT3bWjswybQMnXnbc512teo+LmmtzWytzZw6deogWgAAAJjQRjW7Zmvtp0nmJ3l1km2raqNu07QkD3TLDyTZJUm67dskeWR4faVjVlUHAABglNZkds2pVbVtt7x5ktcmWZShsPfGbrdTknypW766W0+3/VuttdbVT+hm39wtye5Jvpfk5iS7d7N1bpKhyVmuHo8PBwAAsL7ZaPW75BVJLutmwdwgyRWtta9U1V1JLq+qP0vyD0k+0e3/iST/q6oWJ3k0Q6EtrbU7q+qKJHclWZ7kjNbas0lSVe9Mcl2SDZNc2lq7c9w+IQAAwHpktSGvtXZbkukj1O/J0P15K9efTvKmVYx1QZILRqhfm+TaNegXAACAFzGqe/IAAACY2IQ8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR1Yb8qpql6qaX1V3VdWdVfWHXf28qnqgqhZ2r18fdsy7q2pxVf2gql43rD6nqy2uqnOG1Xerqu929c9X1Sbj/UEBAADWB2tyJm95kv/cWts7yaFJzqiqvbttf9laO7B7XZsk3bYTkuyTZE6Sj1TVhlW1YZIPJzkmyd5JfnfYOH/RjfUrSR5L8pZx+nwAAADrldWGvNbaQ621W7vlJ5IsSrLzixzyhiSXt9Z+3lq7N8niJId0r8WttXtaa/+a5PIkb6iqSnJkkiu74y9LctzafiAAAID12ajuyauqXZNMT/LdrvTOqrqtqi6tqu262s5J7h922JKutqr6Dkl+2lpbvlJ9pPc/vaoWVNWCpUuXjqZ1AACA9cIah7yq2jLJF5O8q7X2eJKLk/xykgOTPJTkg+ukw2Faa5e01ma21mZOnTp1Xb8dAADApLPRmuxUVRtnKOB9prX2t0nSWnt42PaPJflKt/pAkl2GHT6tq2UV9UeSbFtVG3Vn84bvDwAAwCisyeyaleQTSRa11j40rP6KYbv9ZpI7uuWrk5xQVZtW1W5Jdk/yvSQ3J9m9m0lzkwxNznJ1a60lmZ/kjd3xpyT50tg+FgAAwPppTc7kzUry5iS3V9XCrvZfMzQ75oFJWpL7kvynJGmt3VlVVyS5K0Mzc57RWns2SarqnUmuS7Jhkktba3d2452d5PKq+rMk/5ChUAkAAMAorTbktdZuTFIjbLr2RY65IMkFI9SvHem41to9GZp9EwAAgDEY1eyaAAAATGxCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJA
HAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD2y0aAbANbCeduMwxjLxj4GAAATjjN5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9stqQV1W7VNX8qrqrqu6sqj/s6ttX1byq+mH3c7uuXlU1t6oWV9VtVTVj2FindPv/sKpOGVY/qKpu746ZW1W1Lj4sAABA363JmbzlSf5za23vJIcmOaOq9k5yTpJvttZ2T/LNbj1Jjkmye/c6PcnFyVAoTHJuklclOSTJuc8Hw26ftw07bs7YPxoAAMD6Z7Uhr7X2UGvt1m75iSSLkuyc5A1JLut2uyzJcd3yG5J8ug25Kcm2VfWKJK9LMq+19mhr7bEk85LM6bZt3Vq7qbXWknx62FgAAACMwqjuyauqXZNMT/LdJDu21h7qNv1Tkh275Z2T3D/ssCVd7cXqS0aoAwAAMEprHPKqasskX0zyrtba48O3dWfg2jj3NlIPp1fVgqpasHTp0nX9dgAAAJPOGoW8qto4QwHvM621v+3KD3eXWqb7+ZOu/kCSXYYdPq2rvVh92gj1F2itXdJam9lamzl16tQ1aR0AAGC9siaza1aSTyRZ1Fr70LBNVyd5fobMU5J8aVj95G6WzUOTLOsu67wuydFVtV034crRSa7rtj1eVYd273XysLEAAAAYhY3WYJ9ZSd6c5PaqWtjV/muS9ye5oqrekuTHSX6723Ztkl9PsjjJU0l+L0laa49W1Z8mubnb7/zW2qPd8juSfCrJ5km+2r0AAAAYpdWGvNbajUlW9dy6o0bYvyU5YxVjXZrk0hHqC5Lsu7peAAAAeHGjml0TAACAiU3IAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHtlo0A0AsA6dt804jLFs7GMAAC8ZZ/IAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHNhp0AwAATDLnbTMOYywb+xjAiJzJAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6JHVhryqurSqflJVdwyrnVdVD1TVwu7168O2vbuqFlfVD6rqdcPqc7ra4qo6Z1h9t6r6blf/fFVtMp4fEAAAYH2yJmfyPpVkzgj1v2ytHdi9rk2Sqto7yQlJ9umO+UhVbVhVGyb5cJJjkuyd5He7fZPkL7qxfiXJY0neMpYPBAAAsD5bbchrrV2f5NE1HO8NSS5vrf28tXZvksVJDulei1tr97TW/jXJ5UneUFWV5MgkV3bHX5bkuFF+BgAAADpjuSfvnVV1W3c553Zdbeck9w/bZ0lXW1V9hyQ/ba0tX6k+oqo6vaoWVNWCpUuXjqF1AACAflrbkHdxkl9OcmCSh5J8cNw6ehGttUtaazNbazOnTp36UrwlAADApLLR2hzUWnv4+eWq+liSr3SrDyTZZdiu07paVlF/JMm2VbVRdzZv+P4AAACM0lqdyauqVwxb/c0kz8+8eXWSE6pq06r
aLcnuSb6X5OYku3czaW6SoclZrm6ttSTzk7yxO/6UJF9am54AAABYgzN5VfW5JIcnmVJVS5Kcm+TwqjowSUtyX5L/lCSttTur6ookdyVZnuSM1tqz3TjvTHJdkg2TXNpau7N7i7OTXF5Vf5bkH5J8Ytw+HQAAwHpmtSGvtfa7I5RXGcRaaxckuWCE+rVJrh2hfk+GZt8EAABgjMYyuyYAAAATjJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjGw26AQBggjhvm3EYY9nYxwBgTJzJAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEdWG/Kq6tKq+klV3TGstn1VzauqH3Y/t+vqVVVzq2pxVd1WVTOGHXNKt/8Pq+qUYfWDqur27pi5VVXj/SEBAADWF2tyJu9TSeasVDsnyTdba7sn+Wa3niTHJNm9e52e5OJkKBQmOTfJq5IckuTc54Nht8/bhh238nsBAACwhlYb8lpr1yd5dKXyG5Jc1i1fluS4YfVPtyE3Jdm2ql6R5HVJ5rXWHm2tPZZkXpI53batW2s3tdZakk8PGwsAAIBRWtt78nZsrT3ULf9Tkh275Z2T3D9svyVd7cXqS0aoj6iqTq+qBVW1YOnSpWvZOgAAQH+NeeKV7gxcG4de1uS9LmmtzWytzZw6depL8ZYAAACTytqGvIe7Sy3T/fxJV38gyS7D9pvW1V6sPm2EOgAAAGthbUPe1UmenyHzlCRfGlY/uZtl89Aky7rLOq9LcnRVbddNuHJ0kuu6bY9X1aHdrJonDxsLAACAUdpodaMiEJ0AABAHSURBVDtU1eeSHJ5kSlUtydAsme9PckVVvSXJj5P8drf7tUl+PcniJE8l+b0kaa09WlV/muTmbr/zW2vPT+byjgzN4Ll5kq92LwAAANbCakNea+13V7HpqBH2bUnOWMU4lya5dIT6giT7rq4PAAAAVm/ME68AAAAwcQh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNCHgAAQI8IeQAAAD0i5AEAAPSIkAcAANAjQh4AAECPCHkAAAA9IuQBAAD0iJAHAADQI0IeAABAj2w06AYAAIAeO2+bcRhj2djHWI+M6UxeVd1XVbdX1cKqWtDVtq+qeVX1w+7ndl29qmpuVS2uqtuqasawcU7p9v9hVZ0yto8EAACw/hqPyzWPaK0d2Fqb2a2fk+SbrbXdk3yzW0+SY5Ls3r1OT3JxMhQKk5yb5FVJDkly7vPBEAAAgNFZF/fkvSHJZd3yZUmOG1b/dBtyU5Jtq+oVSV6XZF5r7dHW2mNJ5iWZsw76AgAA6L2xhryW5OtVdUtVnd7VdmytPdQt/1OSHbvlnZPcP+zYJV1tVfUXqKrTq2pBVS1YunTpGFsHAAD
on7FOvPKa1toDVfXyJPOq6u7hG1trraraGN9j+HiXJLkkSWbOnDlu4wIAAPTFmM7ktdYe6H7+JMlVGbqn7uHuMsx0P3/S7f5Akl2GHT6tq62qDgAAwCitdcirqi2qaqvnl5McneSOJFcneX6GzFOSfKlbvjrJyd0sm4cmWdZd1nldkqOrartuwpWjuxoAAACjNJbLNXdMclVVPT/OZ1trX6uqm5NcUVVvSfLjJL/d7X9tkl9PsjjJU0l+L0laa49W1Z8mubnb7/zW2qNj6AsmrF3PuWZcxrlvs3EZBgCAHlrrkNdauyfJASPUH0ly1Aj1luSMVYx1aZJL17YXAAAAhqyLRygAAAAwIEIeAABAjwh5AAAAPSLkAQAA9IiQBwAA0CNjeYQCnfGYFt+U+AAAwHhwJg8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOgRIQ8AAKBHhDwAAIAeEfIAAAB6RMgDAADoESEPAACgR4Q8AACAHhHyAAAAekTIAwAA6BEhDwAAoEeEPAAAgB4R8gAAAHpEyAMAAOiRjQbdAAAj2/Wca8Y8xn2bjUMjAMCk4kweAABAjwh5AAAAPSLkAQAA9IiQBwAA0CMmXgEAWI+Y1An6T8gDgEluPH5pT/ziDtAXQh6sp/a7bL8xj3H7KbePQycAAIwnIa9HxuOX9sQv7gAAMJmZeAUAAKBHhDwAAIAeEfIAAAB6RMgDAADoEROvAGtt0Z57jcs4e929aFzGAQBAyANgNczcCwCTi8s1AQAAesSZPABg3IzHmV9nfdcPrhKAdUfIAwAAXmDXc64Zl3Hu22xchmEUXK4JAADQIxPmTF5VzUnyP5JsmOTjrbX3D7il9dZ4zJhotkQAABiMCRHyqmrDJB9O8tokS5LcXFVXt9buGmxnAADAoLnfd3QmRMhLckiSxa21e5Kkqi5P8oYkQh4ArGc8gxNYF9an/7ZUa23QPaSq3phkTmvtrd36m5O8qrX2zpX2Oz3J6d3qHkl+8JI2OvFNSfLPg26CScP3hTXlu8Jo+L6wpnxXGA3fl5H9Umtt6srFiXImb4201i5Jcsmg+5ioqmpBa23moPtgcvB9YU35rjAavi+sKd8VRsP3ZXQmyuyaDyTZZdj6tK4GAADAKEyUkHdzkt2rareq2iTJCUmuHnBPAAAAk86EuFyztba8qt6Z5LoMPULh0tbanQNuazJyKSuj4fvCmvJdYTR8X1hTviuMhu/LKEyIiVcAAAAYHxPlck0AAADGgZAHAADQI0IeAABAjwh5AAAAPTIhZtdkbKpqwyQ7Ztj/nq21fxxcR8BkV1WzkixsrT1ZVSclmZHkf7TWfjzg1oBJrqp+I8lh3erft9a+PMh+oI+cyZvkquoPkjycZF6Sa7rXVwbaFBNWVU2rqquqamlV/aSqvlhV0wbdFxPSxUmeqqoDkvznJD9K8unBtsREU1VPVNXj3euJYetPVNXjg+6Piaeq/jzJHya5q3udWVXvG2xXTFR+b1l7HqEwyVXV4iSvaq09MuhemPiqal6Szyb5X13ppCQnttZeO7iumIiq6tbW2oyq+u9JHmitfeL52qB7AyavqrotyYGttee69Q2T/ENrbf/BdsZE5PeWtedM3uR3f5Jlg26CSWNqa+2TrbXl3etTSaYOuikmpCeq6t0Z+gv1mqraIMnGA+6JCayqXlNVv9ctT6mq3QbdExPWtsOWtxlYF0wGfm9ZS+7Jm/zuSfLtqromyc+fL7bWPjS4lpjAHunur/pct/67SZwFZiS/k+Q/JnlLa+2fqurfJ/nAgHtigqqqc5PMTLJHkk8m2STJ3ySZNci+mJD+PMk/VNX8JJWhe/POGWxLTGB+b1lLLtec5Lq/WF+gtfbel7oXJr6q+qUkf5Xk1Ulakv8vyZkm6gHGoqoWJpme5NbW2vSudptL8BhJVb0iycH
d6vdaa/80yH6YuPzesvaEPABWqKobW2uvqaonMvQX6opNSVprbesBtcYEVlXfa60dMuxezi2SfEfIYyRVtXOSX8ovzgp+/eA6gv5xueYkVVUXtdbeVVVfzi/+IpYkaa39xgDaYoKrqrkjlJclWdBa+9JL3Q8TT2vtNd3PrQbdC5PKFVX1P5NsW1VvS3Jako8NuCcmoKr6iwxdDn5nkue6cksi5LFCVf1VRvj99nmttTNfwnYmJSFv8np+lqELh9We/z9DvcS9MHlslmTPJF/o1n8ryb1JDqiqI1pr7xpYZ0wY3Wx3d7bW9hx0L0wOrbULq+q1SR7P0H15/721Nm/AbTExHZdkj9baz1e7J+uzBYNuYLIT8iavaVV1aGvtw8nQpTIZmm2oJTl7oJ0xke2fZFZr7dkkqaqLk9yQ5DVJbh9kY0wcrbVnq+oHVfXv3ffAmqiqP07yecGONXBPhmbqFfJYpdbaZYPuYbIT8iav/5LkhGHrm2RoZrMtMjSz2RdGOoj13nZJtsy/PXZjiyTbd7/U+wuX4bZLcmf3D0hPPl90KTirsFWSr1fVo0k+n+QLrbWHB9wTE8iwy++eSrKwqr6ZX5wV3OV3vEBVTc3QyYu9M3Q1UpKktXbkwJqaJIS8yWuT1tr9w9Zv7B6I/kh3wzuM5P/N0F+u386/TV39vu47841BNsaE8/8MugEmj25G5/dW1f4Zut/q76tqSWvt/xpwa0wcz19+d0uSqwfZCJPKZzL0D0fHJvn9JKckWTrQjiYJs2tOUlW1uLX2K6vY9qPW2i+/1D0xOXRTVx/Srd7cWntwkP0A/VFV/y7JmzJ0pclWZtdkZd0/Kj497LaBDZNs2lp7arCdMRFV1S2ttYOGP5Klqm5urR28umPXdxsMugHW2ne7Gcx+QVX9pyTfG0A/TAJVVUmOSnJAN5vmRlV1yGoOYz1SVU9U1eMjvJ6oqscH3R8TU1W9o7tC4JtJdkjyNgGPVfhmks2HrW8eV5Kwas90Px+qqmOranqS7QfZ0GThTN4kVVUvT/J3Gbqe/daufFCSTZMc514IRtJNtPJckiNba3tV1XZJvu5fxICxqKo/z9DEKwsH3QsTW1UtbK0duLoaJElVvT5DE8TtkqGHom+d5L2tNZf8roYzeZNUa+0nrbVfTfKnSe7rXue31l4t4PEiXtVaOyPJ00nSWnssQ5P2AKy11tq7k2xZVb+XDE2WUFW7DbgtJqYnq2rG8ytVdVCSfxlgP0xA3fMUk2Tz1tqy1todrbUjWmsHCXhrxpk8WI9U1XeT/GqG7sWb0c1a9fXW2vQBtwZMYlV1boZmeN6jtfbKqtopQzNszhpwa0wwVXVwksuTPJihCcD+XZITWmuei8YKVXV7hh77dEtrbcbq9ueFzK4J65e5Sa5KsmNVXZDkjUneM9iWgB74zSTT090+0Fp7sKq2GmxLTESttZuras8ke3SlH7TWnnmxY1gvfS3JYxm6QmD4/eCVpLXWth5MW5OHyzVhPdJa+0yGnrH4viQPZej+Tc9UBMbqX9vQpUEtWTGDIqxQVf9l2Opx3eV3d7TWnqmq9w2sMSaq97TWtk1yTWtt62GvrQS8NSPkwfpnSpKnWmt/neSf3TcDjIMrqup/Jtm2m/n5G0k+PuCemFhOGLb87pW2zXkpG2FS+E7306zOa8nlmrAeGX7fTJJPJtk4yd8kcd8MsNZaaxdW1Wsz9AvZHkn+e2tt3oDbYmKpVSyPtA6bVNV/TPKrVXX8yhtba387gJ4mFSEP1i/umwHWiS7UzUuSqtqgqk7sLhGHpLuUd4Tlkdbh95OcmGTbJP/3SttaEiFvNYQ8WL/8a2utVZX7ZoAxq6qtk5yRZOckV2co5J2R5E+SfD+JkMfzDugm0Kgkmw+bTKOSbDa4tpiIWms3Jrmxqha01j4x6H4mIyEP1i8r3zdzWpKPDbgnYPL6XxmaAe87Sd6a5L9m6Jf24zwYneFaaxsOugcmpWeq6uSVi62
1Tw+imcnEc/JgPdPdN3N0hn4Ru859M8DaqqrbW2v7dcsbZmjW3n/fWnt6sJ0BfVBVfzVsdbMkRyW5tbX2xgG1NGkIebCeqqopSR5p/iMArKWqunX4g4pXXgcYT1W1bZLLW2tmZF0Nj1CA9UBVHVpV366qv62q6VV1R5I7kjxcVf5DCaytA6rq8e71RJL9n19e6QHGAOPhySQe/bQG3JMH64e/ztC9Mtsk+VaSY1prN1XVnkk+l+Rrg2wOmJzcZwWsS1X15fzb7KsbJNk7yRWD62jycLkmrAeqamFr7cBueVFrba9h2/6htTZ9cN0BALxQVf3asNXlSX7cWlsyqH4mE2fyYP3w3LDlf1lpm3/pAQAmnNba3z+//PxcAgNsZ1JxJg/WA1X1bIauY68kmyd56vlNSTZrrW08qN4AAIarqkOTvD/Jo0n+NEOPa5mSoUs2T26tuc1kNYQ8AABgwqiqBfm3uQQuyUpzCbjNZPXMrgkAAEwkG7XWvt5a+0KSf2qt3ZQkrbW7B9zXpCHkAQAAE4m5BMbI5ZoAAMCEYS6BsRPyAAAAesTlmgAAAD0i5AEAAPSIkAcAANAjQh4AAECP/P/HtLLr5hUEvAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.plot(kind='bar', figsize=(15, 8))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/jupyter/Pandas完全指南.ipynb b/docs/jupyter/Pandas完全指南.ipynb new file mode 100644 index 0000000..12eb55b --- /dev/null +++ b/docs/jupyter/Pandas完全指南.ipynb @@ -0,0 +1,8346 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 前言\n", + "\n", + "\n", + "Pandas 是一个Python语言实现的,开源,易于使用的数据结构以及数据分析工具。在Pandas中主要有两种数据类型,可以简单的理解为:\n", + "\n", + "- Series:一维数组(列表)\n", + "- DataFrame:二维数组(矩阵)\n", + "\n", + "在线实验:[Pandas完全指南.ipynb](https://nbviewer.jupyter.org/github/cyub/code-examples/blob/master/jupyter-notes/Pandas%E5%AE%8C%E5%85%A8%E6%8C%87%E5%8D%97.ipynb#)\n", + "\n", + "学习资料:\n", + "\n", + "- [Pandas中文文档](https://www.pypandas.cn/docs/)\n", + "- [利用Python进行数据分析·第2版](https://github.com/iamseancheney/python_for_data_analysis_2nd_chinese_version)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导入pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# 安装pandas,matplotlib(绘图用) 包\n", + "!pip install pandas matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 导入包\n", + "import pandas as pd\n", + "import numpy as np\n", + "from IPython.display import Image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 创建列表" + ] + }, + { + "cell_type": "markdown", + "metadata":
{}, + "source": [ + "### 创建普通列表" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "s = pd.Series([1, 3, 6, np.nan, 23, 3]) # type(s) === 'pandas.core.series.Series'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 创建时间列表" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dates = pd.date_range('20200101', periods=6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 创建矩阵" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据列表(Series)创建矩阵" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-01-012.078888-0.959554-0.3672651.108948
2020-01-02-0.4123620.232540-1.903134-1.831848
2020-01-03-1.8987210.9199760.4856300.758721
2020-01-040.4869610.3783230.1867270.671816
2020-01-050.702523-0.5567980.635000-0.118564
2020-01-060.654506-0.0007270.417828-0.611751
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-01-01 2.078888 -0.959554 -0.367265 1.108948\n", + "2020-01-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-01-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-01-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-01-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-01-06 0.654506 -0.000727 0.417828 -0.611751" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.DataFrame({\n", + " 'a':pd.Series([1, 2, 3, 4]),\n", + " 'b':pd.Timestamp('20180708'),\n", + " 'c':pd.Categorical(['cate1', 'cate2', 'cate3', 'cate4'])\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abc
012018-07-08cate1
122018-07-08cate2
232018-07-08cate3
342018-07-08cate4
\n", + "
" + ], + "text/plain": [ + " a b c\n", + "0 1 2018-07-08 cate1\n", + "1 2 2018-07-08 cate2\n", + "2 3 2018-07-08 cate3\n", + "3 4 2018-07-08 cate4" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据字典创建矩阵" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy', 'Jack', 'Tim'], \n", + " 'age': [20, 32, 36, 24, 23, 18, 27], \n", + " 'gender': np.random.choice(['M','F'],size=7),\n", + " 'score': [25, 94, 57, 62, 70, 88, 67],\n", + " 'country': np.random.choice(['US','CN'],size=7),\n", + " }\n", + "df3 = pd.DataFrame(data, columns = ['name', 'age', 'gender', 'score', 'country'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
0Jason20F25US
1Molly32F94CN
2Tina36F57CN
3Jake24F62US
4Amy23F70CN
5Jack18F88US
6Tim27F67CN
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "0 Jason 20 F 25 US\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "3 Jake 24 F 62 US\n", + "4 Amy 23 F 70 CN\n", + "5 Jack 18 F 88 US\n", + "6 Tim 27 F 67 CN" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 矩阵属性、检视数据" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 行数列数" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6, 4)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 索引" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',\n", + " '2020-01-05', '2020-01-06'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 列名" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['a', 'b', 'c', 'd'], dtype='object')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 值" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.07888761e+00, -9.59553787e-01, -3.67264810e-01,\n", + " 1.10894771e+00],\n", + " [-4.12361501e-01, 2.32539690e-01, 
-1.90313388e+00,\n", + " -1.83184759e+00],\n", + " [-1.89872061e+00, 9.19975617e-01, 4.85630402e-01,\n", + " 7.58720982e-01],\n", + " [ 4.86960560e-01, 3.78322949e-01, 1.86726767e-01,\n", + " 6.71815555e-01],\n", + " [ 7.02523492e-01, -5.56797752e-01, 6.35000384e-01,\n", + " -1.18564302e-01],\n", + " [ 6.54506255e-01, -7.26685067e-04, 4.17828341e-01,\n", + " -6.11751157e-01]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 矩阵信息" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06\n", + "Freq: D\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 a 6 non-null float64\n", + " 1 b 6 non-null float64\n", + " 2 c 6 non-null float64\n", + " 3 d 6 non-null float64\n", + "dtypes: float64(4)\n", + "memory usage: 240.0 bytes\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 矩阵描述信息" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
count6.0000006.0000006.0000006.000000
mean0.2686330.002293-0.090869-0.003780
std1.3283840.6744320.9545441.095503
min-1.898721-0.959554-1.903134-1.831848
25%-0.187531-0.417780-0.228767-0.488454
50%0.5707330.1159070.3022780.276626
75%0.6905190.3418770.4686800.736995
max2.0788880.9199760.6350001.108948
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "count 6.000000 6.000000 6.000000 6.000000\n", + "mean 0.268633 0.002293 -0.090869 -0.003780\n", + "std 1.328384 0.674432 0.954544 1.095503\n", + "min -1.898721 -0.959554 -1.903134 -1.831848\n", + "25% -0.187531 -0.417780 -0.228767 -0.488454\n", + "50% 0.570733 0.115907 0.302278 0.276626\n", + "75% 0.690519 0.341877 0.468680 0.736995\n", + "max 2.078888 0.919976 0.635000 1.108948" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-012.078888-0.959554-0.3672651.108948
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-03-1.8987210.9199760.4856300.758721
2020-06-040.4869610.3783230.1867270.671816
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-060.654506-0.0007270.417828-0.611751
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-06 0.654506 -0.000727 0.417828 -0.611751" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### 更改索引\n", + "df.index = pd.date_range('2020/06/01', periods=df.shape[0])\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### top5 数据" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-012.078888-0.959554-0.3672651.108948
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### tail5 数据" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-03-1.8987210.9199760.4856300.758721
2020-06-040.4869610.3783230.1867270.671816
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-060.654506-0.0007270.417828-0.611751
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-06 0.654506 -0.000727 0.417828 -0.611751" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 某一列值统计" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-1.898721 1\n", + " 0.702523 1\n", + " 2.078888 1\n", + " 0.486961 1\n", + " 0.654506 1\n", + "-0.412362 1\n", + "Name: a, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['a'].value_counts(dropna=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 查看每一列唯一值统计" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
-1.903134NaNNaN1.0NaN
-1.8987211.0NaNNaNNaN
-1.831848NaNNaNNaN1.0
-0.959554NaN1.0NaNNaN
-0.611751NaNNaNNaN1.0
-0.556798NaN1.0NaNNaN
-0.4123621.0NaNNaNNaN
-0.367265NaNNaN1.0NaN
-0.118564NaNNaNNaN1.0
-0.000727NaN1.0NaNNaN
0.186727NaNNaN1.0NaN
0.232540NaN1.0NaNNaN
0.378323NaN1.0NaNNaN
0.417828NaNNaN1.0NaN
0.485630NaNNaN1.0NaN
0.4869611.0NaNNaNNaN
0.635000NaNNaN1.0NaN
0.6545061.0NaNNaNNaN
0.671816NaNNaNNaN1.0
0.7025231.0NaNNaNNaN
0.758721NaNNaNNaN1.0
0.919976NaN1.0NaNNaN
1.108948NaNNaNNaN1.0
2.0788881.0NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "-1.903134 NaN NaN 1.0 NaN\n", + "-1.898721 1.0 NaN NaN NaN\n", + "-1.831848 NaN NaN NaN 1.0\n", + "-0.959554 NaN 1.0 NaN NaN\n", + "-0.611751 NaN NaN NaN 1.0\n", + "-0.556798 NaN 1.0 NaN NaN\n", + "-0.412362 1.0 NaN NaN NaN\n", + "-0.367265 NaN NaN 1.0 NaN\n", + "-0.118564 NaN NaN NaN 1.0\n", + "-0.000727 NaN 1.0 NaN NaN\n", + " 0.186727 NaN NaN 1.0 NaN\n", + " 0.232540 NaN 1.0 NaN NaN\n", + " 0.378323 NaN 1.0 NaN NaN\n", + " 0.417828 NaN NaN 1.0 NaN\n", + " 0.485630 NaN NaN 1.0 NaN\n", + " 0.486961 1.0 NaN NaN NaN\n", + " 0.635000 NaN NaN 1.0 NaN\n", + " 0.654506 1.0 NaN NaN NaN\n", + " 0.671816 NaN NaN NaN 1.0\n", + " 0.702523 1.0 NaN NaN NaN\n", + " 0.758721 NaN NaN NaN 1.0\n", + " 0.919976 NaN 1.0 NaN NaN\n", + " 1.108948 NaN NaN NaN 1.0\n", + " 2.078888 1.0 NaN NaN NaN" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.apply(pd.Series.value_counts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 排序" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据索引(index)排序" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-060.654506-0.0007270.417828-0.611751
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-040.4869610.3783230.1867270.671816
2020-06-03-1.8987210.9199760.4856300.758721
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-012.078888-0.959554-0.3672651.108948
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-06 0.654506 -0.000727 0.417828 -0.611751\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort_index(axis=, ascending=)\n", + "# axis:0-行排序,1-列排序; ascending:True-升序,False-降序\n", + "df.sort_index(axis=0, ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dcba
2020-06-011.108948-0.367265-0.9595542.078888
2020-06-02-1.831848-1.9031340.232540-0.412362
2020-06-030.7587210.4856300.919976-1.898721
2020-06-040.6718160.1867270.3783230.486961
2020-06-05-0.1185640.635000-0.5567980.702523
2020-06-06-0.6117510.417828-0.0007270.654506
\n", + "
" + ], + "text/plain": [ + " d c b a\n", + "2020-06-01 1.108948 -0.367265 -0.959554 2.078888\n", + "2020-06-02 -1.831848 -1.903134 0.232540 -0.412362\n", + "2020-06-03 0.758721 0.485630 0.919976 -1.898721\n", + "2020-06-04 0.671816 0.186727 0.378323 0.486961\n", + "2020-06-05 -0.118564 0.635000 -0.556798 0.702523\n", + "2020-06-06 -0.611751 0.417828 -0.000727 0.654506" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_index(axis=1, ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据值排序" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-012.078888-0.959554-0.3672651.108948
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-060.654506-0.0007270.417828-0.611751
2020-06-040.4869610.3783230.1867270.671816
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-03-1.8987210.9199760.4856300.758721
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-06 0.654506 -0.000727 0.417828 -0.611751\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by='a', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-03-1.8987210.9199760.4856300.758721
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-040.4869610.3783230.1867270.671816
2020-06-060.654506-0.0007270.417828-0.611751
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-012.078888-0.959554-0.3672651.108948
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-06 0.654506 -0.000727 0.417828 -0.611751\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by=['a','b'], ascending=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 选取数据" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 选取某一列" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2020-06-01 2.078888\n", + "2020-06-02 -0.412362\n", + "2020-06-03 -1.898721\n", + "2020-06-04 0.486961\n", + "2020-06-05 0.702523\n", + "2020-06-06 0.654506\n", + "Freq: D, Name: a, dtype: float64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['a'] # 等效于df.a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据索引选取某几行数据" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-012.078888-0.959554-0.3672651.108948
2020-06-02-0.4123620.232540-1.903134-1.831848
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['2020-06-01':'2020-06-02'] # 选取索引以2020-06-01开始,到2020-06-02结束的数据" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据列名选择某几列数据" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cb
2020-06-01-0.367265-0.959554
2020-06-02-1.9031340.232540
2020-06-030.4856300.919976
2020-06-040.1867270.378323
2020-06-050.635000-0.556798
2020-06-060.417828-0.000727
\n", + "
" + ], + "text/plain": [ + " c b\n", + "2020-06-01 -0.367265 -0.959554\n", + "2020-06-02 -1.903134 0.232540\n", + "2020-06-03 0.485630 0.919976\n", + "2020-06-04 0.186727 0.378323\n", + "2020-06-05 0.635000 -0.556798\n", + "2020-06-06 0.417828 -0.000727" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['c', 'b']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据索引和列名选择数据\n", + "\n", + "loc[行名选择, 列名选择],未指定行名或列名,或者指定为:则表示选择当前所有行,或列" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 2.078888\n", + "b -0.959554\n", + "c -0.367265\n", + "d 1.108948\n", + "Name: 2020-06-01 00:00:00, dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['2020-06-01']" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.9595537865841992" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['2020-06-01', 'b']" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2020-06-01 -0.959554\n", + "2020-06-02 0.232540\n", + "2020-06-03 0.919976\n", + "2020-06-04 0.378323\n", + "2020-06-05 -0.556798\n", + "2020-06-06 -0.000727\n", + "Freq: D, Name: b, dtype: float64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[:, 'b'] # type(df.loc[:, 'b']) === 'pandas.core.series.Series',而type(df.loc[:, ['b']]) === ’pandas.core.frame.DataFrame‘" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ab
2020-06-012.078888-0.959554
2020-06-02-0.4123620.232540
2020-06-03-1.8987210.919976
2020-06-040.4869610.378323
2020-06-050.702523-0.556798
2020-06-060.654506-0.000727
\n", + "
" + ], + "text/plain": [ + " a b\n", + "2020-06-01 2.078888 -0.959554\n", + "2020-06-02 -0.412362 0.232540\n", + "2020-06-03 -1.898721 0.919976\n", + "2020-06-04 0.486961 0.378323\n", + "2020-06-05 0.702523 -0.556798\n", + "2020-06-06 0.654506 -0.000727" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[:, ['a', 'b']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据行索引和列索引取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0788876064798893" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0,0] # === df.loc['2020-06-01', 'a']" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 2.078888\n", + "b -0.959554\n", + "c -0.367265\n", + "d 1.108948\n", + "Name: 2020-06-01 00:00:00, dtype: float64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0, :] # ==== df.loc['2020-06-01', :]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据布尔表达式表达式取数据\n", + "\n", + "只有当布尔表达式为真时的数据才会被选择" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-012.078888-0.959554-0.3672651.108948
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 2.078888 -0.959554 -0.367265 1.108948" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.a > 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [a, b, c, d]\n", + "Index: []" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['a'] > 1) & (df['d'] <0)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 添加/删除列、更新、替换数据" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 设置某矩阵项值" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc['2020-06-01', 'a'] = np.nan\n", + "df.loc['2020-06-06', 'c'] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcd
2020-06-01NaN-0.959554-0.3672651.108948
2020-06-02-0.4123620.232540-1.903134-1.831848
2020-06-03-1.8987210.9199760.4856300.758721
2020-06-040.4869610.3783230.1867270.671816
2020-06-050.702523-0.5567980.635000-0.118564
2020-06-060.654506-0.000727NaN-0.611751
\n", + "
" + ], + "text/plain": [ + " a b c d\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据条件创建新列" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "df['e'] = np.where((df['a'] > 1) & (df['d']<0), 1, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcde
2020-06-01NaN-0.959554-0.3672651.1089480
2020-06-02-0.4123620.232540-1.903134-1.8318480
2020-06-03-1.8987210.9199760.4856300.7587210
2020-06-040.4869610.3783230.1867270.6718160
2020-06-050.702523-0.5567980.635000-0.1185640
2020-06-060.654506-0.000727NaN-0.6117510
\n", + "
" + ], + "text/plain": [ + " a b c d e\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948 0\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751 0" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据已有列创建新列" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "tmp = df.copy()\n", + "df.loc[:,'f'] = tmp.apply(lambda row: row['b']+ row['d'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01NaN-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.000727NaN-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751 0 -0.612478" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 替换数据" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01NaN-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.000727NaN-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751 0 -0.612478" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 将所有等于1的值替换成20\n", + "df.replace(1,20)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01NaN-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.000727NaN-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751 0 -0.612478" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 使用one替换1,three替换3\n", + "df.replace([1,3],['one','three'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 列名重命名" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abccdef
2020-06-01NaN-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.000727NaN-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b cc d e f\n", + "2020-06-01 NaN -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 NaN -0.611751 0 -0.612478" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.rename(columns={'c':'cc'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 重设索引" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bcdef
a
NaN-0.959554-0.3672651.10894800.149394
-0.4123620.232540-1.903134-1.8318480-1.599308
-1.8987210.9199760.4856300.75872101.678697
0.4869610.3783230.1867270.67181601.050139
0.702523-0.5567980.635000-0.1185640-0.675362
0.654506-0.000727NaN-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " b c d e f\n", + "a \n", + " NaN -0.959554 -0.367265 1.108948 0 0.149394\n", + "-0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "-1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + " 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + " 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + " 0.654506 -0.000727 NaN -0.611751 0 -0.612478" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 将a设置为索引\n", + "df.set_index('a')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 删除列" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bcde
2020-06-01-0.959554-0.3672651.1089480
2020-06-020.232540-1.903134-1.8318480
2020-06-030.9199760.4856300.7587210
2020-06-040.3783230.1867270.6718160
2020-06-05-0.5567980.635000-0.1185640
2020-06-06-0.000727NaN-0.6117510
\n", + "
" + ], + "text/plain": [ + " b c d e\n", + "2020-06-01 -0.959554 -0.367265 1.108948 0\n", + "2020-06-02 0.232540 -1.903134 -1.831848 0\n", + "2020-06-03 0.919976 0.485630 0.758721 0\n", + "2020-06-04 0.378323 0.186727 0.671816 0\n", + "2020-06-05 -0.556798 0.635000 -0.118564 0\n", + "2020-06-06 -0.000727 NaN -0.611751 0" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop(columns=['a', 'f'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 处理Nan数据" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 检查是否Nan值" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01TrueFalseFalseFalseFalseFalse
2020-06-02FalseFalseFalseFalseFalseFalse
2020-06-03FalseFalseFalseFalseFalseFalse
2020-06-04FalseFalseFalseFalseFalseFalse
2020-06-05FalseFalseFalseFalseFalseFalse
2020-06-06FalseFalseTrueFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 True False False False False False\n", + "2020-06-02 False False False False False False\n", + "2020-06-03 False False False False False False\n", + "2020-06-04 False False False False False False\n", + "2020-06-05 False False False False False False\n", + "2020-06-06 False False True False False False" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01FalseTrueTrueTrueTrueTrue
2020-06-02TrueTrueTrueTrueTrueTrue
2020-06-03TrueTrueTrueTrueTrueTrue
2020-06-04TrueTrueTrueTrueTrueTrue
2020-06-05TrueTrueTrueTrueTrueTrue
2020-06-06TrueTrueFalseTrueTrueTrue
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 False True True True True True\n", + "2020-06-02 True True True True True True\n", + "2020-06-03 True True True True True True\n", + "2020-06-04 True True True True True True\n", + "2020-06-05 True True True True True True\n", + "2020-06-06 True True False True True True" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.notnull() # df.isnull()反操作" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 删除掉包含null值的行" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### dropna(axis=, how=):丢弃NaN数据,\n", + "# axis:0-按行丢弃,1-按列丢弃; how:'any'-只要含有NaN数据就丢弃,'all'-所有数据都为NaN时丢弃\n", + "\n", + "df.dropna(axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 替换Nan" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-011000.000000-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.0007271000.000000-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 1000.000000 -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 1000.000000 -0.611751 0 -0.612478" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#### 使用1000替换Nan\n", + "df.fillna(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
2020-06-01-0.093418-0.959554-0.3672651.10894800.149394
2020-06-02-0.4123620.232540-1.903134-1.8318480-1.599308
2020-06-03-1.8987210.9199760.4856300.75872101.678697
2020-06-040.4869610.3783230.1867270.67181601.050139
2020-06-050.702523-0.5567980.635000-0.1185640-0.675362
2020-06-060.654506-0.000727-0.192608-0.6117510-0.612478
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "2020-06-01 -0.093418 -0.959554 -0.367265 1.108948 0 0.149394\n", + "2020-06-02 -0.412362 0.232540 -1.903134 -1.831848 0 -1.599308\n", + "2020-06-03 -1.898721 0.919976 0.485630 0.758721 0 1.678697\n", + "2020-06-04 0.486961 0.378323 0.186727 0.671816 0 1.050139\n", + "2020-06-05 0.702523 -0.556798 0.635000 -0.118564 0 -0.675362\n", + "2020-06-06 0.654506 -0.000727 -0.192608 -0.611751 0 -0.612478" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 使用平均值替换所有null值\n", + "df.fillna(df.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 聚合、分组、统计" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列的平均数" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a -0.093418\n", + "b 0.002293\n", + "c -0.192608\n", + "d -0.003780\n", + "e 0.000000\n", + "f -0.001486\n", + "dtype: float64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回列之间的相关性" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcdef
a1.000000-0.8210880.055410-0.201634NaN-0.486299
b-0.8210881.0000000.024617-0.127603NaN0.441503
c0.0554100.0246171.0000000.743462NaN0.682118
d-0.201634-0.1276030.7434621.000000NaN0.833588
eNaNNaNNaNNaNNaNNaN
f-0.4862990.4415030.6821180.833588NaN1.000000
\n", + "
" + ], + "text/plain": [ + " a b c d e f\n", + "a 1.000000 -0.821088 0.055410 -0.201634 NaN -0.486299\n", + "b -0.821088 1.000000 0.024617 -0.127603 NaN 0.441503\n", + "c 0.055410 0.024617 1.000000 0.743462 NaN 0.682118\n", + "d -0.201634 -0.127603 0.743462 1.000000 NaN 0.833588\n", + "e NaN NaN NaN NaN NaN NaN\n", + "f -0.486299 0.441503 0.682118 0.833588 NaN 1.000000" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列中非null值数量" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 5\n", + "b 6\n", + "c 5\n", + "d 6\n", + "e 6\n", + "f 6\n", + "dtype: int64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列中最大值" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 0.702523\n", + "b 0.919976\n", + "c 0.635000\n", + "d 1.108948\n", + "e 0.000000\n", + "f 1.678697\n", + "dtype: float64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.max()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列中最小值" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a -1.898721\n", + "b -0.959554\n", + "c -1.903134\n", + "d -1.831848\n", + "e 0.000000\n", + "f -1.599308\n", + "dtype: float64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.min()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列的中值" + ] + }, + { + "cell_type": "code", + 
"execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 0.486961\n", + "b 0.115907\n", + "c 0.186727\n", + "d 0.276626\n", + "e 0.000000\n", + "f -0.231542\n", + "dtype: float64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.median()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 返回每一列的标准偏差" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "a 1.105735\n", + "b 0.674432\n", + "c 1.030199\n", + "d 1.095503\n", + "e 0.000000\n", + "f 1.210962\n", + "dtype: float64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.std()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后取TopN" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
4Amy23F70CN
6Tim27F67CN
2Tina36F57CN
5Jack18F88US
3Jake24F62US
0Jason20F25US
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "4 Amy 23 F 70 CN\n", + "6 Tim 27 F 67 CN\n", + "2 Tina 36 F 57 CN\n", + "5 Jack 18 F 88 US\n", + "3 Jake 24 F 62 US\n", + "0 Jason 20 F 25 US" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### 取每个国家下,分值前二的记录\n", + "\n", + "# 先排序\n", + "df4 = df3.sort_values(['country','score'],ascending=[1, 0],inplace=False)\n", + "df4" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
4Amy23F70CN
5Jack18F88US
3Jake24F62US
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "4 Amy 23 F 70 CN\n", + "5 Jack 18 F 88 US\n", + "3 Jake 24 F 62 US" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 取值\n", + "df4.groupby(['country']).head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 多重分组后取TopN" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
4Amy23F70CN
6Tim27F67CN
2Tina36F57CN
5Jack18F88US
3Jake24F62US
0Jason20F25US
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "4 Amy 23 F 70 CN\n", + "6 Tim 27 F 67 CN\n", + "2 Tina 36 F 57 CN\n", + "5 Jack 18 F 88 US\n", + "3 Jake 24 F 62 US\n", + "0 Jason 20 F 25 US" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### 取每个国家下,分值前二的记录\n", + "\n", + "# 先排序\n", + "df5 = df3.sort_values(['country','gender', 'score'],ascending=[1, 0, 0],inplace=False)\n", + "df5" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
5Jack18F88US
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "5 Jack 18 F 88 US" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df5 = df5.groupby(['country', 'gender']).head(1) # 注意此处取1\n", + "df5" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
5Jack18F88US
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "5 Jack 18 F 88 US" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df5.groupby(['country']).head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组之后取平均值" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
gender
F66.142857
\n", + "
" + ], + "text/plain": [ + " score\n", + "gender \n", + "F 66.142857" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scoreMean = df3.groupby(['gender'])['score'].mean()\n", + "scoreMean = pd.DataFrame(scoreMean) # 等效于scoreMean = scoreMean.to_frame()\n", + "scoreMean" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscore_xcountryscore_y
0Jason20F25US66.142857
1Molly32F94CN66.142857
2Tina36F57CN66.142857
3Jake24F62US66.142857
4Amy23F70CN66.142857
5Jack18F88US66.142857
6Tim27F67CN66.142857
\n", + "
" + ], + "text/plain": [ + " name age gender score_x country score_y\n", + "0 Jason 20 F 25 US 66.142857\n", + "1 Molly 32 F 94 CN 66.142857\n", + "2 Tina 36 F 57 CN 66.142857\n", + "3 Jake 24 F 62 US 66.142857\n", + "4 Amy 23 F 70 CN 66.142857\n", + "5 Jack 18 F 88 US 66.142857\n", + "6 Tim 27 F 67 CN 66.142857" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#### 合并\n", + "df3.merge(scoreMean,left_on='gender',right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
0Jason20F25US
1Molly32F94CN
2Tina36F57CN
3Jake24F62US
4Amy23F70CN
5Jack18F88US
6Tim27F67CN
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "0 Jason 20 F 25 US\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "3 Jake 24 F 62 US\n", + "4 Amy 23 F 70 CN\n", + "5 Jack 18 F 88 US\n", + "6 Tim 27 F 67 CN" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组之后计数" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gender
country
CN4
US3
\n", + "
" + ], + "text/plain": [ + " gender\n", + "country \n", + "CN 4\n", + "US 3" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby(['country'])['gender'].count().to_frame()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gender
countrygender
CNF4
USF3
\n", + "
" + ], + "text/plain": [ + " gender\n", + "country gender \n", + "CN F 4\n", + "US F 3" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### 按性别统计每个国家的人数\n", + "\n", + "df3.groupby(['country', 'gender'])['gender'].count().to_frame()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后唯一值统计" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gender
country
CN1
US1
\n", + "
" + ], + "text/plain": [ + " gender\n", + "country \n", + "CN 1\n", + "US 1" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby(['country'])['gender'].nunique().to_frame()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后求和" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agescore
country
CN118288
US62175
\n", + "
" + ], + "text/plain": [ + " age score\n", + "country \n", + "CN 118 288\n", + "US 62 175" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 默认是所有数值类型列求和\n", + "df3.groupby('country').sum() " + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "country\n", + "CN 288\n", + "US 175\n", + "Name: score, dtype: int64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 指定列求和\n", + "df3.groupby('country')['score'].sum() # 等效于df3.groupby(['country'])['score'].apply(np.sum)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 分组后指定列求和(其他聚合也类似)可以理解成Split, apply, combine\n", + "Image(url=\"http://static.cyub.vip/images/202001/pandas.split-apply-combine.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAENCAYAAAD0eSVZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAUtUlEQVR4nO3df5BV5Z3n8fdXISBKVJS4CsamdjD+QsB0MxpNoriC0SrFSkzpjtFx3GAlmjXZrBV0rZWpjFXZSjJWyM6aIgUDMc6Iu46rO/4YxJgYs/FHSwxEIcomKI0oLTGIjpi0fvePPpArNPTtvt195eH9qrp1z3nOc8793qL59NPPPefcyEwkSWXZp9kFSJIGnuEuSQUy3CWpQIa7JBXIcJekAhnuklSgXsM9IkZGxBMR8cuIeCYi/rpqnxARj0fEmohYEhEfqNpHVOtrqu0tg/sWJEk7qmfk/jYwPTMnA1OAsyPiZOC/ATdn5p8BrwFXVP2vAF6r2m+u+kmShlD05SKmiBgFPAp8AbgX+DeZ2RURpwBzM3NmRPxLtfzziBgGvAyMzd280KGHHpotLS2NvA9J2us89dRTr2bm2J62DavnABGxL/AU8GfA3wH/D/h9ZnZVXTqAcdXyOGAdQBX8m4FDgFd3dfyWlhba29vrKUWSVImIF3a1ra4PVDPzncycAowHpgHHDEBRsyOiPSLaOzs7Gz2cJKlGn86WyczfAw8DpwAHVdMu0B3666vl9cCRANX2A4FNPRxrfma2Zmbr2LE9/lUhSeqnes6WGRsRB1XL+wFnAavoDvnPVN0uA+6ulu+p1qm2/2h38+2SpIFXz5z74cDiat59H+COzPzniHgWuD0i/gb4BbCg6r8AuDUi1gC/Ay4ahLol7YX++Mc/0tHRwdatW5tdypAaOXIk48ePZ/jw4XXv02u4Z+YKYGoP7b+he/59x/atwIV1VyBJdero6GD06NG0tLQQEc0uZ0hkJps2baKjo4MJEybUvZ9XqEraY2zdupVDDjlkrwl2gIjgkEMO6fNfK4a7pD3K3hTs2/TnPRvuklSgui5i0h5m7oHNrqA+czc3uwLt4Vrm3Dugx1v7jXMH9HjN5Mhdkvpo1qxZfPSjH+X4449n/vz5ACxYsICjjz6aadOm8fnPf56rr74agM7OTj796U/T1tZGW1sbP/vZz4akRkfuktRHCxcuZMyYMbz11lu0tbVx7rnn8vWvf53ly5czevRopk+fzuTJkwG45ppr+MpXvsJpp53Giy++yMyZM1m1atWg12i4S1IfzZs3j7vuuguAdevWceutt/LJT36SMWPGAHDhhRfy3HPPAbBs2TKeffbZ7fu+/vrrvPHGGxxwwAGDWqPhLkl98OMf/5hly5bx85//nFGjRnH66adzzDHH7HI0/u677/LYY48xcuTIIa3TOXdJ6oPNmzdz8MEHM2rUKFavXs1jjz3Gm2++yU9+8hNee+01urq6uPPOO7f3nzFjBt/97ne3rz/99NNDUqfhLkl9cPbZZ9PV1cWxxx7LnDlzOPnkkxk3bhzXX38906ZN49RTT6WlpYUDD+w+a23evHm0t7dz4oknctxxx/G9731vSOp0WkbSHqsZpy6OGDGC+++/f6f21tZWZs+eTVdXFxdccAGzZs0C4NBDD2XJkiVDXaYjd0kaCHPnzmXKlCmccMIJTJgwYXu4N4sjd0kaAN/61reaXcJ7OHKXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQZ8tI2nMN9O2th/g21F1dXQwbNjgx7MhdkvrgzTff5Nxzz2Xy5MmccMIJLFmyhCeffJKPfexjTJ48mWnTprFlyxa2bt3K5ZdfzqRJk5g6dSoPP/wwAIsWLeK8885j+vTpnHnmmQ
B885vfpK2tjRNPPJEbb7xxQOp05C5JffDAAw9wxBFHcO+93V8UsnnzZqZOncqSJUtoa2vj9ddfZ7/99uM73/kOEcHKlStZvXo1M2bM2H6nyOXLl7NixQrGjBnD0qVLef7553niiSfITM477zweeeQRPvGJTzRUpyN3SeqDSZMm8eCDD/K1r32Nn/70p7z44oscfvjhtLW1AfDBD36QYcOG8eijj3LJJZcAcMwxx3DUUUdtD/ezzjpr++2Bly5dytKlS5k6dSonnXQSq1ev5vnnn2+4TkfuktQHRx99NMuXL+e+++7jhhtuYPr06X0+xv777799OTO57rrruPLKKweyTEfuktQXL730EqNGjeKSSy7h2muv5fHHH2fDhg08+eSTAGzZsoWuri4+/vGPc9tttwHw3HPP8eKLL/KRj3xkp+PNnDmThQsX8sYbbwCwfv16Nm7c2HCdjtwlqQ9WrlzJtddeyz777MPw4cO55ZZbyEy+9KUv8dZbb7HffvuxbNkyvvjFL/KFL3yBSZMmMWzYMBYtWsSIESN2Ot6MGTNYtWoVp5xyCgAHHHAAP/zhD/nQhz7UUJ2RmQ0dYCC0trZme3t7s8sox0CfHjZYhvi0M+35Vq1axbHHHtvsMpqip/ceEU9lZmtP/Z2WkaQC9RruEXFkRDwcEc9GxDMRcU3VPjci1kfE09XjnJp9rouINRHx64iYOZhvQJK0s3rm3LuAr2bm8ogYDTwVEQ9W227OzPfcxDgijgMuAo4HjgCWRcTRmfnOQBYuSdq1XkfumbkhM5dXy1uAVcC43exyPnB7Zr6dmb8F1gDTBqJYSXo/fE441Prznvs05x4RLcBU4PGq6eqIWBERCyPi4KptHLCuZrcOdv/LQJLqMnLkSDZt2rRXBXxmsmnTJkaOHNmn/eo+FTIiDgDuBL6cma9HxC3A14Gsnr8N/FUfjjcbmA3w4Q9/uC81S9pLjR8/no6ODjo7O5tdypAaOXIk48eP79M+dYV7RAynO9hvy8x/AsjMV2q2fx/452p1PXBkze7jq7b3yMz5wHzoPhWyT1VL2isNHz6cCRMmNLuMPUI9Z8sEsABYlZl/W9N+eE23C4BfVcv3ABdFxIiImABMBJ4YuJIlSb2pZ+R+KvA5YGVEPF21XQ9cHBFT6J6WWQtcCZCZz0TEHcCzdJ9pc5VnykjS0Oo13DPzUSB62HTfbva5CbipgbokSQ3wClVJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUC9hntEHBkRD0fEsxHxTERcU7WPiYgHI+L56vngqj0iYl5ErImIFRFx0mC/CUnSe9Uzcu8CvpqZxwEnA1dFxHHAHOChzJwIPFStA3wKmFg9ZgO3DHjVkqTd6jXcM3NDZi6vlrcAq4BxwPnA4qrbYmBWtXw+8IPs9hhwUEQcPuCVS5J2qU9z7hHRAkwFHgcOy8wN1aaXgcOq5XHAuprdOqo2SdIQqTvcI+IA4E7gy5n5eu22zEwg+/LCETE7Itojor2zs7Mvu0qSelFXuEfEcLqD/bbM/Keq+ZVt0y3V88aqfT1wZM3u46u298jM+ZnZmpmtY8eO7W/9kqQe1HO2TAALgFWZ+bc1m+4BLquWLwPurmm/tDpr5mRgc830jSRpCAyro8+pwOeAlRHxdNV2PfAN4I6IuAJ4Afhste0+4BxgDfCvwOUDWrEkqVe9hntmPgrELjaf2UP/BK5qsC5JUgO8QlWSCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalA9dzyV5IGxtwDm11BfeZubnYFDXPkLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkg
pkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVKBewz0iFkbExoj4VU3b3IhYHxFPV49zarZdFxFrIuLXETFzsAqXJO1aPSP3RcDZPbTfnJlTqsd9ABFxHHARcHy1z/+IiH0HqlhJUn16DffMfAT4XZ3HOx+4PTPfzszfAmuAaQ3UJ0nqh0bm3K+OiBXVtM3BVds4YF1Nn46qTZI0hPob7rcA/xaYAmwAvt3XA0TE7Ihoj4j2zs7OfpYhSepJv8I9M1/JzHcy813g+/xp6mU9cGRN1/FVW0/HmJ+ZrZnZOnbs2P6UIUnahX6Fe0QcXrN6AbDtTJp7gIsiYkRETAAmAk80VqIkqa96/YLsiPhH4HTg0IjoAG4ETo+IKUACa4ErATLzmYi4A3gW6AKuysx3Bqd0SdKu9BrumXlxD80LdtP/JuCmRoqSJDXGK1QlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBWo16/Z05+0zLm32SXUZe3IZlcgqdkcuUtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIK1Gu4R8TCiNgYEb+qaRsTEQ9GxPPV88FVe0TEvIhYExErIuKkwSxektSzekbui4Czd2ibAzyUmROBh6p1gE8BE6vHbOCWgSlTktQXvYZ7Zj4C/G6H5vOBxdXyYmBWTfsPsttjwEERcfhAFStJqk9/59wPy8wN1fLLwGHV8jhgXU2/jqpNkjSEGv5ANTMTyL7uFxGzI6I9Ito7OzsbLUOSVKO/4f7KtumW6nlj1b4eOLKm3/iqbSeZOT8zWzOzdezYsf0sQ5LUk/6G+z3AZdXyZcDdNe2XVmfNnAxsrpm+kSQNkV7v5x4R/wicDhwaER3AjcA3gDsi4grgBeCzVff7gHOANcC/ApcPQs2SpF70Gu6ZefEuNp3ZQ98Ermq0KElSY7xCVZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFWhYswuQ1LiWOfc2u4S6rB3Z7Ar2Ho7cJalAhrskFchwl6QCGe6SVKCGPlCNiLXAFuAdoCszWyNiDLAEaAHWAp/NzNcaK1OS1BcDMXI/IzOnZGZrtT4HeCgzJwIPVeuSpCE0GNMy5wOLq+XFwKxBeA1J0m40Gu4JLI2IpyJidtV2WGZuqJZfBg7raceImB0R7RHR3tnZ2WAZkqRajV7EdFpmro+IDwEPRsTq2o2ZmRGRPe2YmfOB+QCtra099pEk9U9DI/fMXF89bwTuAqYBr0TE4QDV88ZGi5Qk9U2/wz0i9o+I0duWgRnAr4B7gMuqbpcBdzdapCSpbxqZljkMuCsith3nHzLzgYh4ErgjIq4AXgA+23iZkqS+6He4Z+ZvgMk9tG8CzmykKElSY7xCVZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kq0KCFe0ScHRG/jog1ETFnsF5HkrSzQQn3iNgX+DvgU8BxwMURcdxgvJYkaWeDNXKfBqzJzN9k5h+A24HzB+m1JEk7GDZIxx0HrKtZ7wD+vLZDRMwGZlerb0TErweplr1OwKHAq82uo1d/Hc2uQEPMn80Bd9SuNgxWuPcqM+cD85v1+iWLiPbMbG12HdKO/NkcOoM1LbMeOLJmfXzVJkkaAoMV7k8CEyNiQkR8ALgIuGeQXkuStINBmZbJzK6IuBr4F2BfYGFmPjMYr6UeOd2l9y
t/NodIZGaza5AkDTCvUJWkAhnuklQgw12SCmS4S1KBmnYRkxoXEZfubntm/mCoapHqEREHA79Pz+QYdJ4tsweLiO/uYtN5wLjM9Je3miYi/itwR2aujogRwAPAZKAL+PeZuaypBRbOcC9ERATwF8DXgGeBmzJzRXOr0t4sIp4BTsjMrO4ldTHw74CjgcWZOa2pBRbOkd0eLiKGAX8J/GfgMeAzmelN2PR+8Iea6ZeZwO2Z+Q6wqvq51SDyA9U9WERcRfco/aPA2Zn5lwa73kfejogTImIscAawtGbbqCbVtNdwWmYPFhHvAhuBTmCnf8jMPHHIi5IqEfHnwGJgLHBzZv5N1X4O8LnMvLiZ9ZXOcN+DRcRE4DDee+986L4j58uZuWboq5K6RcR/ArbdGD2rx6vAo5n526YVtpdwWmbPdjOwOTNfqH0Am6ttUjONBg6oHqOBDwKtwP0RcVEzC9sbOHLfg0XEk5nZtottKzNz0lDXJPUmIsYAyzLzpGbXUjJH7nu2g3azbb8hq0Lqg8z8HX+artEgMdz3bO0R8fkdGyPiPwBPNaEeqVcRcQbwWrPrKJ3TMnuwiDgMuAv4A38K81bgA8AFmflys2qTImIlO5/FNQZ4Cbg0M1cPfVV7D8O9ANVI6IRq9ZnM/FEz65EAIuKoHZoS2JSZbzajnr2N4S5JBXLOXZIKZLhLUoEMd6kfIuLLEeH9UfS+5Zy71A8RsRZozcxXe9i2b3X3Q6lpHLmrWBFxaUSsiIhfRsStEdESET+q2h6KiA9X/RZFxGdq9nujej49In4cEf8rIlZHxG3R7T8CRwAPR8TD2/aJiG9HxC+B/xIR/7vmeGdFxF1D+ua11/OeyipSRBwP3AB8LDNfrS55X0z3l0Qsjoi/AuYBs3o51FTgeLrPzf4ZcGpmzqtuinVGzch9f+DxzPxq9cUpqyJibGZ2ApcDCwf8TUq74chdpZoO/M9t4Vtd8n4K8A/V9luB0+o4zhOZ2ZGZ7wJPAy276PcOcGf1Wlkd/5KIOKh63fv7+T6kfnHkLnV/p+c+ABGxD91X+G7zds3yO+z6/8zWHebZ/x74P8BWun/JdA1cuVLvHLmrVD8CLoyIQ2D7nQj/L7DtVrN/Afy0Wl5L97dZQfeXiw+v4/hb6L6NbY8y8yW6p3JuoDvopSHlyF1FysxnIuIm4CcR8Q7wC+BLwN9HxLV0f3vV5VX37wN3Vx+GPgDUc3n8fOCBiHgpM8/YRZ/bgLGZuaqR9yL1h6dCSoMkIv478IvMXNDsWrT3MdylQRART9H9F8BZmfl2b/2lgWa4S1KB/EBVkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFej/A3tgqEh0rq73AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.clf()\n", + "df3.groupby('country').sum().plot(kind='bar')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAENCAYAAAD0eSVZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAP8ElEQVR4nO3df6xfdX3H8ecLqswfbIDcdVg6L9GaBdws7g5x+gfoFMRkxURJmZPK2GoW3DRzy9CZ6RJJMJmSsDmSGtBqUOymjG4iioXNsU3wVrFQCrHTEloLvQgizohpee+Pezq+lHt7f3zv937pp89H8s33nM/nc855f8PldU8/95zzTVUhSWrLEcMuQJK08Ax3SWqQ4S5JDTLcJalBhrskNchwl6QGzRjuSX4hye1JvpNka5K/6dpPSnJbku1JPp/k2V37Ud369q5/dLAfQZJ0oNmcuT8OvLaqXg6sBM5OcjrwEeDyqnoJ8AhwUTf+IuCRrv3ybpwkaRFlLjcxJXkucCvwx8CXgF+pqr1JXgV8qKrOSvKVbvm/kywBHgBG6iAHOv7442t0dLSfzyFJh53Nmzc/VFUjU/Utmc0OkhwJbAZeAnwc+B/gR1W1txuyE1jWLS8D7gfogv9R4AXAQ9Ptf3R0lPHx8dmUIknqJLlvur5Z/UG1qvZV1UrgROA04NcWoKi1ScaTjE9MTPS7O0lSjzldLVNVPwJuAV4FHNNNu8Bk6O/qlncBywG6/l8CfjjFvtZV1VhVjY2MTPmvCknSPM3mapmRJMd0y88BXg9sYzLk39INWwNc3y1v7Nbp+m8+2Hy7JGnhzWbO/QRgfTfvfgSwoar+NcndwLVJPgx8G7iqG38V8Jkk24GHgdUDqFuSdBAzhntVbQFOnaL9e0zOvx/Y/jPgrQtSnSRpXrxDVZIaZLhLUoMMd0lq0KxuYtKk0Uu+NOwSmrLjsjcNuwSpWZ65S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgGcM9yfIktyS5O8nWJO/u2j+UZFeSO7rXOT3bvC/J9iT3JjlrkB9AkvR0S2YxZi/w3qr6VpKjgc1Jbur6Lq+qv+0dnORkYDVwCvBC4GtJXlpV+xaycEnS9GY8c6+q3VX1rW75MWAbsOwgm6wCrq2qx6vq+8B24LSFKFaSNDtzmnNPMgqcCtzWNb0ryZYkVyc5tmtbBtzfs9lODv7LQJK0wGYd7kmeD3wBeE9V/Ri4EngxsBLYDXx0LgdOsjbJeJLxiYmJuWwqSZrBrMI9ybOYDPZrquqLAFX1YFXtq6ongE/w5NTLLmB5z+Yndm1PUVXrqmqsqsZGRkb6+QySpAPM5mqZAFcB26rqYz3tJ/QMezNwV7e8EVid5KgkJwErgNsXrmRJ0kxmc7XMq4G3A3cmuaNrez9wfpKVQA
E7gHcCVNXWJBuAu5m80uZir5SRpMU1Y7hX1a1Apui64SDbXApc2kddkqQ+eIeqJDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgGcM9yfIktyS5O8nWJO/u2o9LclOS73bvx3btSXJFku1JtiR5xaA/hCTpqWZz5r4XeG9VnQycDlyc5GTgEmBTVa0ANnXrAG8EVnSvtcCVC161JOmgZgz3qtpdVd/qlh8DtgHLgFXA+m7YeuDcbnkV8Oma9A3gmCQnLHjlkqRpzWnOPckocCpwG7C0qnZ3XQ8AS7vlZcD9PZvt7NokSYtk1uGe5PnAF4D3VNWPe/uqqoCay4GTrE0ynmR8YmJiLptKkmYwq3BP8iwmg/2aqvpi1/zg/umW7n1P174LWN6z+Yld21NU1bqqGquqsZGRkfnWL0mawmyulglwFbCtqj7W07URWNMtrwGu72m/oLtq5nTg0Z7pG0nSIlgyizGvBt4O3Jnkjq7t/cBlwIYkFwH3Aed1fTcA5wDbgZ8CFy5oxZKkGc0Y7lV1K5Bpul83xfgCLu6zLklSH7xDVZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGzeeSvpGe40Uu+NOwSmrLjsjcNu4S+eeYuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoBnDPcnVSfYkuaun7UNJdiW5o3ud09P3viTbk9yb5KxBFS5Jmt5sztw/BZw9RfvlVbWye90AkORkYDVwSrfNPyQ5cqGKlSTNzozhXlVfBx6e5f5WAddW1eNV9X1gO3BaH/VJkuahnzn3dyXZ0k3bHNu1LQPu7xmzs2uTJC2i+Yb7lcCLgZXAbuCjc91BkrVJxpOMT0xMzLMMSdJU5hXuVfVgVe2rqieAT/Dk1MsuYHnP0BO7tqn2sa6qxqpqbGRkZD5lSJKmMa9wT3JCz+qbgf1X0mwEVic5KslJwArg9v5KlCTN1YxfkJ3kc8AZwPFJdgIfBM5IshIoYAfwToCq2ppkA3A3sBe4uKr2DaZ0SdJ0Zgz3qjp/iuarDjL+UuDSfoqSJPXHO1QlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAbNGO5Jrk6yJ8ldPW3HJbkpyXe792O79iS5Isn2JFuSvGKQxUuSpjabM/dPAWcf0HYJsKmqVgCbunWANwIrutda4MqFKVOSNBczhntVfR14+IDmVcD6bnk9cG5P+6dr0jeAY5KcsFDFSpJmZ75z7kurane3/ACwtFteBtzfM25n1yZJWkR9/0G1qgqouW6XZG2S8STjExMT/ZYhSeox33B/cP90S/e+p2vfBSzvGXdi1/Y0VbWuqsaqamxkZGSeZUiSpjLfcN8IrOmW1wDX97Rf0F01czrwaM/0jSRpkSyZaUCSzwFnAMcn2Ql8ELgM2JDkIuA+4Lxu+A3AOcB24KfAhQOoWZI0gxnDvarOn6brdVOMLeDifouSJPXHO1QlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYt6WfjJDuAx4B9wN6qGktyHP
B5YBTYAZxXVY/0V6YkaS4W4sz9zKpaWVVj3folwKaqWgFs6tYlSYtoENMyq4D13fJ64NwBHEOSdBD9hnsBX02yOcnarm1pVe3ulh8Alk61YZK1ScaTjE9MTPRZhiSpV19z7sBrqmpXkl8GbkpyT29nVVWSmmrDqloHrAMYGxubcowkaX76OnOvql3d+x7gOuA04MEkJwB073v6LVKSNDfzDvckz0ty9P5l4A3AXcBGYE03bA1wfb9FSpLmpp9pmaXAdUn27+ezVXVjkm8CG5JcBNwHnNd/mZKkuZh3uFfV94CXT9H+Q+B1/RQlSeqPd6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0aWLgnOTvJvUm2J7lkUMeRJD3dQMI9yZHAx4E3AicD5yc5eRDHkiQ93aDO3E8DtlfV96rq58C1wKoBHUuSdIAlA9rvMuD+nvWdwCt7ByRZC6ztVn+S5N4B1XI4Oh54aNhFzCQfGXYFGgJ/NhfWi6brGFS4z6iq1gHrhnX8liUZr6qxYdchHcifzcUzqGmZXcDynvUTuzZJ0iIYVLh/E1iR5KQkzwZWAxsHdCxJ0gEGMi1TVXuTvAv4CnAkcHVVbR3EsTQlp7v0TOXP5iJJVQ27BknSAvMOVUlqkOEuSQ0y3CWpQYa7JDVoaDcxqX9JLjhYf1V9erFqkWYjybHAj8orOQbOq2UOYUn+bpqu3wWWVZW/vDU0Sf4a2FBV9yQ5CrgReDmwF/i9qvraUAtsnOHeiCQB3gb8JXA3cGlVbRluVTqcJdkKvKyqqnuW1PnA7wAvBdZX1WlDLbBxntkd4pIsAd4B/DnwDeAtVeVD2PRM8POe6ZezgGurah+wrfu51QD5B9VDWJKLmTxL/03g7Kp6h8GuZ5DHk7wsyQhwJvDVnr7nDqmmw4bTMoewJE8Ae4AJ4Gn/IavqNxa9KKmT5JXAemAEuLyqPty1nwO8varOH2Z9rTPcD2FJVgBLeeqz82HyiZwPVNX2xa9KmpTkz4B0q9W9HgJurarvD62ww4TTMoe2y4FHq+q+3hfwaNcnDdPRwPO719HALwJjwJeTrB5mYYcDz9wPYUm+WVW/NU3fnVX164tdkzSTJMcBX6uqVwy7lpZ55n5oO+Ygfc9ZtCqkOaiqh3lyukYDYrgf2saT/NGBjUn+ENg8hHqkGSU5E3hk2HW0zmmZQ1iSpcB1wM95MszHgGcDb66qB4ZVm5TkTp5+FddxwA+AC6rqnsWv6vBhuDegOxN6Wbe6tapuHmY9EkCSFx3QVMAPq+p/h1HP4cZwl6QGOecuSQ0y3CWpQYa7NA9J3pPE56PoGcs5d2kekuwAxqrqoSn6juyefigNjWfualaSC5JsSfKdJJ9JMprk5q5tU5Jf7cZ9Kslberb7Sfd+RpJ/S/JPSe5Jck0m/SnwQuCWJLfs3ybJR5N8B/irJP/cs7/XJ7luUT+8Dns+U1lNSnIK8AHgt6vqoe6W9/VMfknE+iR/AFwBnDvDrk4FTmHy2uz/BF5dVVd0D8U6s+fM/XnAbVX13u6LU7YlGamqCeBC4OoF/5DSQXjmrla9FvjH/eHb3fL+KuCzXf9ngNfMYj+3V9XOqnoCuAMYnWbcPuAL3bGq2//vJzmmO+6X5/k5pHnxzF2a/E7PIwCSHMHkHb77Pd6zvI/p/5/52QHz7J8E/gX4GZO/ZPYuXLnSzDxzV6tuBt6a5AXw/08i/C9g/6Nm3wb8R7e8g8lvs4LJLxd/1iz2/xiTj7GdUlX9gMmpnA8wGfTSovLMXU2qqq1JLgX+Pck+4NvAnwCfTPIXTH571YXd8E8A13d/DL0RmM3t8euAG5P8oKrOnGbMNcBIVW3r57NI8+GlkNKAJP
l74NtVddWwa9Hhx3CXBiDJZib/BfD6qnp8pvHSQjPcJalB/kFVkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNej/AIl4KMTyIG3lAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df3.groupby('country')['score'].sum().plot(kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后求平均值,最大值,最小值" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
minmaxmean
country
CN579472.000000
US258858.333333
\n", + "
" + ], + "text/plain": [ + " score \n", + " min max mean\n", + "country \n", + "CN 57 94 72.000000\n", + "US 25 88 58.333333" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby('country').agg({'score':['min','max','mean']})" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
aminamaxmean
country
CN579472.000000
US258858.333333
\n", + "
" + ], + "text/plain": [ + " amin amax mean\n", + "country \n", + "CN 57 94 72.000000\n", + "US 25 88 58.333333" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 跟上面效果一致\n", + "df3.groupby('country')['score'].agg([np.min, np.max, np.mean])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后不同列使用不同求值函数" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
scoreage
maxminstdsumcountmax
country
CN945715.684387118436
US882531.65964862324
\n", + "
" + ], + "text/plain": [ + " score age \n", + " max min std sum count max\n", + "country \n", + "CN 94 57 15.684387 118 4 36\n", + "US 88 25 31.659648 62 3 24" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby('country').agg({'score': ['max','min', 'std'],\n", + " 'age': ['sum', 'count', 'max']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 多个分组结果拼接" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
scoreage
country
CN72.000000118
US58.33333362
\n", + "
" + ], + "text/plain": [ + " score age\n", + "country \n", + "CN 72.000000 118\n", + "US 58.333333 62" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1=df3.groupby('country')['score'].mean().to_frame()\n", + "t2 = df3.groupby('country')['age'].sum().to_frame()\n", + "\n", + "t1.merge(t2,left_index=True,right_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 遍历分组" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CN\n", + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "4 Amy 23 F 70 CN\n", + "6 Tim 27 F 67 CN\n", + "US\n", + " name age gender score country\n", + "0 Jason 20 F 25 US\n", + "3 Jake 24 F 62 US\n", + "5 Jack 18 F 88 US\n" + ] + } + ], + "source": [ + "grouped = df3.groupby('country')\n", + "for name,group in grouped:\n", + " print(name)\n", + " print(group)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('CN', 'F')\n", + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "4 Amy 23 F 70 CN\n", + "6 Tim 27 F 67 CN\n", + "('US', 'F')\n", + " name age gender score country\n", + "0 Jason 20 F 25 US\n", + "3 Jake 24 F 62 US\n", + "5 Jack 18 F 88 US\n" + ] + } + ], + "source": [ + "grouped = df3.groupby(['country', 'gender'])\n", + "for name,group in grouped:\n", + " print(name)\n", + " print(group)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取分组信息" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'CN': Int64Index([1, 2, 4, 6], dtype='int64'),\n", + " 'US': Int64Index([0, 3, 5], dtype='int64')}" + ] + }, + "execution_count": 83, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby('country').groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 取分组后的某一组" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
1Molly32F94CN
2Tina36F57CN
4Amy23F70CN
6Tim27F67CN
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "4 Amy 23 F 70 CN\n", + "6 Tim 27 F 67 CN" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby('country').get_group('CN')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 分组后过滤" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [name, age, gender, score, country]\n", + "Index: []" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3.groupby('name').filter(lambda x: len(x) >= 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据透视" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
scoreage
genderFF
name
Amy7023
Jack8818
Jake6224
Jason2520
Molly9432
Tim6727
Tina5736
\n", + "
" + ], + "text/plain": [ + " score age\n", + "gender F F\n", + "name \n", + "Amy 70 23\n", + "Jack 88 18\n", + "Jake 62 24\n", + "Jason 25 20\n", + "Molly 94 32\n", + "Tim 67 27\n", + "Tina 57 36" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 数据透视的值项只能是数值类型\n", + "# pivot(index =,columns=,values=):透视数据\n", + "# index:透视的列(作为索引, 且值都是唯一的); columns-用于进一步细分index;values查看具体值\n", + "\n", + "df3.pivot(index ='name',columns='gender',values=['score','age'])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
countrygender
CNF288
USF175
\n", + "
" + ], + "text/plain": [ + " score\n", + "country gender \n", + "CN F 288\n", + "US F 175" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# pivot_table(index =,columns=,values=,aggfunc=):透视数据\n", + "# index:分组的列(作为索引, 值可以重复, 重复项由aggfunc聚合); columns-用于进一步细分index;values查看具体值;fill_value:0-用0替换NaN; margins:True-汇总\n", + "\n", + "pd.pivot_table(df3,index=['country', 'gender'], values=['score'],aggfunc=np.sum)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
summean
agescoreagescore
countrygender
CNF11828829.50000072.000000
USF6217520.66666758.333333
All18046325.71428666.142857
\n", + "
" + ], + "text/plain": [ + " sum mean \n", + " age score age score\n", + "country gender \n", + "CN F 118 288 29.500000 72.000000\n", + "US F 62 175 20.666667 58.333333\n", + "All 180 463 25.714286 66.142857" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.pivot_table(df3,index=['country', 'gender'], values=['score', 'age'],aggfunc=[np.sum, np.mean],fill_value=0,margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderscorecountry
0Jason20F25US
1Molly32F94CN
2Tina36F57CN
3Jake24F62US
4Amy23F70CN
5Jack18F88US
6Tim27F67CN
\n", + "
" + ], + "text/plain": [ + " name age gender score country\n", + "0 Jason 20 F 25 US\n", + "1 Molly 32 F 94 CN\n", + "2 Tina 36 F 57 CN\n", + "3 Jake 24 F 62 US\n", + "4 Amy 23 F 70 CN\n", + "5 Jack 18 F 88 US\n", + "6 Tim 27 F 67 CN" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 合并、连接、拼接(Merge, join, and concatenate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 拼接(concatenate)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----t1----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 C1 D1\n", + "2 A2 B2 C2 D2\n", + "3 A3 B3 C3 D3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----t2-----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----t3----\n", + " A B C D\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
8A8B8C8D8
9A9B9C9D9
10A10B10C10D10
11A11B11C11D11
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 C1 D1\n", + "2 A2 B2 C2 D2\n", + "3 A3 B3 C3 D3\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7\n", + "8 A8 B8 C8 D8\n", + "9 A9 B9 C9 D9\n", + "10 A10 B10 C10 D10\n", + "11 A11 B11 C11 D11" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],\n", + " 'B': ['B0', 'B1', 'B2', 'B3'],\n", + " 'C': ['C0', 'C1', 'C2', 'C3'],\n", + " 'D': ['D0', 'D1', 'D2', 'D3']},\n", + " index=[0, 1, 2, 3])\n", + "print('-----t1----')\n", + "display(t1)\n", + "\n", + "t2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],\n", + " 'B': ['B4', 'B5', 'B6', 'B7'],\n", + " 'C': ['C4', 'C5', 'C6', 'C7'],\n", + " 'D': ['D4', 'D5', 'D6', 'D7']},\n", + " index=[4, 5, 6, 7])\n", + "\n", + "print('----t2-----')\n", + "display(t2)\n", + "\n", + "t3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],\n", + " 'B': ['B8', 'B9', 'B10', 'B11'],\n", + " 'C': ['C8', 'C9', 'C10', 'C11'],\n", + " 'D': ['D8', 'D9', 'D10', 'D11']},\n", + " index=[8, 9, 10, 11])\n", + "\n", + "print('-----t3----')\n", + "print(t2)\n", + "frames = [t1, t2, t3]\n", + "\n", + "pd.concat(frames)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# concat类似:linux的split命令把文件分成多个,然后再拼接成一个完整文件\n", + "\n", + "Image(url=\"http://static.cyub.vip/images/202001/pandas.concat.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----t4----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDBDF
0A0B0C0D0NaNNaNNaN
1A1B1C1D1NaNNaNNaN
2A2B2C2D2B2D2F2
3A3B3C3D3B3D3F3
6NaNNaNNaNNaNB6D6F6
7NaNNaNNaNNaNB7D7F7
\n", + "
" + ], + "text/plain": [ + " A B C D B D F\n", + "0 A0 B0 C0 D0 NaN NaN NaN\n", + "1 A1 B1 C1 D1 NaN NaN NaN\n", + "2 A2 B2 C2 D2 B2 D2 F2\n", + "3 A3 B3 C3 D3 B3 D3 F3\n", + "6 NaN NaN NaN NaN B6 D6 F6\n", + "7 NaN NaN NaN NaN B7 D7 F7" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],\n", + " 'D': ['D2', 'D3', 'D6', 'D7'],\n", + " 'F': ['F2', 'F3', 'F6', 'F7']},\n", + " index=[2, 3, 6, 7])\n", + "\n", + "print('-----t4----')\n", + "\n", + "pd.concat([t1, t4], axis=1, sort=False) # 此时相当于outer join" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.concat.outer_join.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDBDF
2A2B2C2D2B2D2F2
3A3B3C3D3B3D3F3
\n", + "
" + ], + "text/plain": [ + " A B C D B D F\n", + "2 A2 B2 C2 D2 B2 D2 F2\n", + "3 A3 B3 C3 D3 B3 D3 F3" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([t1, t4], axis=1, join='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.concat.inner_join.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0A0B0C0D0
1A1B1C1D1
2A2B2C2D2
3A3B3C3D3
4A4B4C4D4
5A5B5C5D5
6A6B6C6D6
7A7B7C7D7
8A8B8C8D8
9A9B9C9D9
10A10B10C10D10
11A11B11C11D11
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 A0 B0 C0 D0\n", + "1 A1 B1 C1 D1\n", + "2 A2 B2 C2 D2\n", + "3 A3 B3 C3 D3\n", + "4 A4 B4 C4 D4\n", + "5 A5 B5 C5 D5\n", + "6 A6 B6 C6 D6\n", + "7 A7 B7 C7 D7\n", + "8 A8 B8 C8 D8\n", + "9 A9 B9 C9 D9\n", + "10 A10 B10 C10 D10\n", + "11 A11 B11 C11 D11" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1.append([t2,t3]) # 相当于pd.concat([t1, t2, t3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 连接(Join)\n", + "\n", + "join(on=None, how='left', lsuffix='', rsuffix='', sort=False)\n", + "\n", + "on:join的键,默认是矩阵的index, how:join方式,left-相当于左连接,outer,inner\n", + "\n", + "更多查看[Database-style DataFrame or named Series joining/merging](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging)\n", + "\n", + "[Combining Datasets: Merge and Join](https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----left----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
K0A0B0
K1A1B1
K2A2B2
\n", + "
" + ], + "text/plain": [ + " A B\n", + "K0 A0 B0\n", + "K1 A1 B1\n", + "K2 A2 B2" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---right----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CD
K0C0D0
K2C2D2
K3C3D3
\n", + "
" + ], + "text/plain": [ + " C D\n", + "K0 C0 D0\n", + "K2 C2 D2\n", + "K3 C3 D3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
K0A0B0C0D0
K1A1B1NaNNaN
K2A2B2C2D2
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "K0 A0 B0 C0 D0\n", + "K1 A1 B1 NaN NaN\n", + "K2 A2 B2 C2 D2" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],\n", + " 'B': ['B0', 'B1', 'B2']},\n", + " index=['K0', 'K1', 'K2'])\n", + "\n", + "print('----left----')\n", + "display(left)\n", + "\n", + "right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],\n", + " 'D': ['D0', 'D2', 'D3']},\n", + " index=['K0', 'K2', 'K3'])\n", + "print('---right----')\n", + "display(right)\n", + "\n", + "left.join(right) # 相当于 pd.merge(left, right, left_index=True, right_index=True, how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.left.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
K0A0B0C0D0
K1A1B1NaNNaN
K2A2B2C2D2
K3NaNNaNC3D3
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "K0 A0 B0 C0 D0\n", + "K1 A1 B1 NaN NaN\n", + "K2 A2 B2 C2 D2\n", + "K3 NaN NaN C3 D3" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.join(right, how='outer') # 相当于pd.merge(left, right, left_index=True, right_index=True, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.outer.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
K0A0B0C0D0
K2A2B2C2D2
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "K0 A0 B0 C0 D0\n", + "K2 A2 B2 C2 D2" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.join(right, how='inner') #相当于pd.merge(left, right, left_index=True, right_index=True, how='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.inner.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 根据某一列进行join\n", + "\n", + "left.join(right, on=key_or_keys)= pd.merge(left, right, left_on=key_or_keys, right_index=True,\n", + " how='left', sort=False) // 使用left矩阵的key_or_keys列与right矩阵的index进行join" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----left----\n", + " A B key\n", + "0 A0 B0 K0\n", + "1 A1 B1 K1\n", + "2 A2 B2 K0\n", + "3 A3 B3 K1\n", + "----right----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CD
K0C0D0
K1C1D1
\n", + "
" + ], + "text/plain": [ + " C D\n", + "K0 C0 D0\n", + "K1 C1 D1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABkeyCD
0A0B0K0C0D0
1A1B1K1C1D1
2A2B2K0C0D0
3A3B3K1C1D1
\n", + "
" + ], + "text/plain": [ + " A B key C D\n", + "0 A0 B0 K0 C0 D0\n", + "1 A1 B1 K1 C1 D1\n", + "2 A2 B2 K0 C0 D0\n", + "3 A3 B3 K1 C1 D1" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],\n", + " 'B': ['B0', 'B1', 'B2', 'B3'],\n", + " 'key': ['K0', 'K1', 'K0', 'K1']})\n", + "\n", + "print('----left----')\n", + "print(left)\n", + "\n", + "right = pd.DataFrame({'C': ['C0', 'C1'],\n", + " 'D': ['D0', 'D1']},\n", + " index=['K0', 'K1'])\n", + "\n", + "print('----right----')\n", + "display(right)\n", + "\n", + "\n", + "left.join(right, on='key') # 相当于pd.merge(left, right, left_on='key', right_index=True,how='left', sort=False);" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.key.left.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----left----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABkey1key2
0A0B0K0K0
1A1B1K0K1
2A2B2K1K0
3A3B3K2K1
\n", + "
" + ], + "text/plain": [ + " A B key1 key2\n", + "0 A0 B0 K0 K0\n", + "1 A1 B1 K0 K1\n", + "2 A2 B2 K1 K0\n", + "3 A3 B3 K2 K1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----right----\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CD
K0K0C0D0
K1K0C1D1
K2K0C2D2
K3K11C3D3
\n", + "
" + ], + "text/plain": [ + " C D\n", + "K0 K0 C0 D0\n", + "K1 K0 C1 D1\n", + "K2 K0 C2 D2\n", + "K3 K11 C3 D3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABkey1key2CD
0A0B0K0K0C0D0
1A1B1K0K1NaNNaN
2A2B2K1K0C1D1
3A3B3K2K1NaNNaN
\n", + "
" + ], + "text/plain": [ + " A B key1 key2 C D\n", + "0 A0 B0 K0 K0 C0 D0\n", + "1 A1 B1 K0 K1 NaN NaN\n", + "2 A2 B2 K1 K0 C1 D1\n", + "3 A3 B3 K2 K1 NaN NaN" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#### 多列的join\n", + "\n", + "left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],\n", + " 'B': ['B0', 'B1', 'B2', 'B3'],\n", + " 'key1': ['K0', 'K0', 'K1', 'K2'],\n", + " 'key2': ['K0', 'K1', 'K0', 'K1']})\n", + "\n", + "print('----left----')\n", + "display(left)\n", + "\n", + "index = pd.MultiIndex.from_tuples([('K0', 'K0'), ('K1', 'K0'),\n", + " ('K2', 'K0'), ('K3', 'K11')])\n", + "\n", + "\n", + "right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],\n", + " 'D': ['D0', 'D1', 'D2', 'D3']},\n", + " index=index)\n", + "\n", + "print('----right----')\n", + "display(right)\n", + "\n", + "left.join(right, on=['key1', 'key2'])" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.keys.left.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABkey1key2CD
0A0B0K0K0C0D0
2A2B2K1K0C1D1
\n", + "
" + ], + "text/plain": [ + " A B key1 key2 C D\n", + "0 A0 B0 K0 K0 C0 D0\n", + "2 A2 B2 K1 K0 C1 D1" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.join(right, on=['key1', 'key2'], how='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image(url=\"http://static.cyub.vip/images/202001/pandas.join.keys.inner.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据导入导出" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 从csv中导入数据" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id日期游戏id游戏名称国家国家码下载数下载用户数成功下载数成功下载用户数安装数安装用户数
075643162020-01-271Uphill Rush Water Park Racing俄罗斯RU111111
175643172020-01-271Uphill Rush Water Park Racing肯尼亚KE222200
275643182020-01-271Uphill Rush Water Park Racing刚果金CD110000
375643192020-01-271Uphill Rush Water Park Racing尼泊尔NP110000
475643202020-01-271Uphill Rush Water Park Racing索马里SO111111
.......................................
17988680104812020-02-02175Soccer Star 2022 World Legend: Football game赞比亚ZM220000
17988780104822020-02-02175Soccer Star 2022 World Legend: Football game尼日利亚NG112222
17988880104832020-02-02175Soccer Star 2022 World Legend: Football game埃及EG220000
17988980104842020-02-02175Soccer Star 2022 World Legend: Football game科特迪瓦CI332222
17989080104852020-02-02175Soccer Star 2022 World Legend: Football game约旦JO110000
\n", + "

179891 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " id 日期 游戏id \\\n", + "0 7564316 2020-01-27 1 \n", + "1 7564317 2020-01-27 1 \n", + "2 7564318 2020-01-27 1 \n", + "3 7564319 2020-01-27 1 \n", + "4 7564320 2020-01-27 1 \n", + "... ... ... ... \n", + "179886 8010481 2020-02-02 175 \n", + "179887 8010482 2020-02-02 175 \n", + "179888 8010483 2020-02-02 175 \n", + "179889 8010484 2020-02-02 175 \n", + "179890 8010485 2020-02-02 175 \n", + "\n", + " 游戏名称 国家 国家码 下载数 下载用户数 \\\n", + "0 Uphill Rush Water Park Racing 俄罗斯 RU 1 1 \n", + "1 Uphill Rush Water Park Racing 肯尼亚 KE 2 2 \n", + "2 Uphill Rush Water Park Racing 刚果金 CD 1 1 \n", + "3 Uphill Rush Water Park Racing 尼泊尔 NP 1 1 \n", + "4 Uphill Rush Water Park Racing 索马里 SO 1 1 \n", + "... ... ... .. ... ... \n", + "179886 Soccer Star 2022 World Legend: Football game 赞比亚 ZM 2 2 \n", + "179887 Soccer Star 2022 World Legend: Football game 尼日利亚 NG 1 1 \n", + "179888 Soccer Star 2022 World Legend: Football game 埃及 EG 2 2 \n", + "179889 Soccer Star 2022 World Legend: Football game 科特迪瓦 CI 3 3 \n", + "179890 Soccer Star 2022 World Legend: Football game 约旦 JO 1 1 \n", + "\n", + " 成功下载数 成功下载用户数 安装数 安装用户数 \n", + "0 1 1 1 1 \n", + "1 2 2 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 1 1 1 1 \n", + "... ... ... ... ... 
\n", + "179886 0 0 0 0 \n", + "179887 2 2 2 2 \n", + "179888 0 0 0 0 \n", + "179889 2 2 2 2 \n", + "179890 0 0 0 0 \n", + "\n", + "[179891 rows x 12 columns]" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv('../dataset/game_daily_stats_20200127_20200202.csv', names=['id', '日期', '游戏id', '游戏名称', '国家', '国家码', '下载数', '下载用户数', '成功下载数', '成功下载用户数','安装数', '安装用户数'],na_filter = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 导出数据到csv" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('/tmp/pandas.csv', encoding=\"utf_8_sig\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/jupyter/Spark上手示例1:RDD操作.ipynb b/docs/jupyter/Spark上手示例1:RDD操作.ipynb new file mode 100644 index 0000000..b0da35a --- /dev/null +++ b/docs/jupyter/Spark上手示例1:RDD操作.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# 引入pyspark,并创建spark上下文\n", + "import findspark\n", + "findspark.init()\n", + "import pyspark\n", + "sc = pyspark.SparkContext()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
创建RDD的第一种方式,读外部数据,比如本地磁盘文件" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "rdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyspark.rdd.RDD" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 查看rdd类型\n", + "type(rdd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 RDD之转换(Transformation)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 31 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "## map是转换操作的一种,这时候只是形成DAG\n", + "rdd = rdd.map(lambda x: len(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 RDD之行动(Action)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13187\n", + "CPU times: user 12 ms, sys: 0 ns, total: 12 ms\n", + "Wall time: 1.58 s\n" + ] + } + ], + "source": [ + "%%time\n", + "## reduce是行动操作的一种, 这个时候才真正的计算\n", + "charCount = rdd.reduce(lambda x, y: x+y)\n", + "\n", + "print(charCount)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 328 2260 13687 ./dataset/Goodbye_Object_Oriented_Programming.txt\r\n" + ] + } + ], + "source": [ + "! 
wc ./dataset/Goodbye_Object_Oriented_Programming.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 示例:统计单词出现的次数" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I’ve been programming in Object Oriented languages for decades. The first OO language I used was C++ and then Smalltalk and finally .NET and Java.',\n", + " '']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wordRdd = sc.textFile('./dataset/Goodbye_Object_Oriented_Programming.txt')\n", + "\n", + "# take操作就是一种Action, 返回前n数据\n", + "wordRdd.take(2) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# 将每一行文本打散\n", + "wordRdd = wordRdd.map(lambda line: line.split(' '))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['I’ve',\n", + " 'been',\n", + " 'programming',\n", + " 'in',\n", + " 'Object',\n", + " 'Oriented',\n", + " 'languages',\n", + " 'for',\n", + " 'decades.',\n", + " 'The',\n", + " 'first',\n", + " 'OO',\n", + " 'language',\n", + " 'I',\n", + " 'used',\n", + " 'was',\n", + " 'C++',\n", + " 'and',\n", + " 'then',\n", + " 'Smalltalk',\n", + " 'and',\n", + " 'finally',\n", + " '.NET',\n", + " 'and',\n", + " 'Java.'],\n", + " ['']]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wordRdd.take(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2493" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 扁平化处理\n", + "\n", + "wordRdd = wordRdd.flatMap(lambda x: x)\n", + "\n", + "# 查看有多少个单词\n", + "wordRdd.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I’ve', 'been']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 查看前两条数据\n", + "wordRdd.take(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2260" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 过滤掉空字符串\n", + "wordRdd = wordRdd.filter(lambda x: x != '')\n", + "\n", + "# 查看有多少个单词\n", + "wordRdd.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('I’ve', 1), ('been', 1)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 转换成key-value形式rdd 即 (key, value)\n", + "wordRdd = wordRdd.map(lambda word: (word, 1))\n", + "\n", + "wordRdd.take(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('face', 1),\n", + " ('was', 18),\n", + " ('Monkey', 2),\n", + " ('how', 4),\n", + " ('Just', 1),\n", + " ('for', 11),\n", + " ('Directories', 1),\n", + " ('could', 4),\n", + " ('gained', 1),\n", + " ('AGAIN', 1)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wordRdd = wordRdd.reduceByKey(lambda x, y: x+y)\n", + "\n", + "# 查看一下\n", + "wordRdd.take(10)\n", + "\n", + "# 查看全部\n", + "# wordRdd.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# 使用pandas继续计算\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wordcount
0face1
1was18
2Monkey2
3how4
4Just1
5for11
6Directories1
7could4
8gained1
9AGAIN1
\n", + "
" + ], + "text/plain": [ + " word count\n", + "0 face 1\n", + "1 was 18\n", + "2 Monkey 2\n", + "3 how 4\n", + "4 Just 1\n", + "5 for 11\n", + "6 Directories 1\n", + "7 could 4\n", + "8 gained 1\n", + "9 AGAIN 1" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(wordRdd.collect())\n", + "\n", + "# 设置栏位名字\n", + "df.columns = ['word', 'count']\n", + "\n", + "\n", + "# 查看前10条数据\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wordcount
263the121
271to57
576of47
358and45
589a41
797is38
136in35
593I32
685that29
645The26
\n", + "
" + ], + "text/plain": [ + " word count\n", + "263 the 121\n", + "271 to 57\n", + "576 of 47\n", + "358 and 45\n", + "589 a 41\n", + "797 is 38\n", + "136 in 35\n", + "593 I 32\n", + "685 that 29\n", + "645 The 26" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 查看出现次数最多的十个单词\n", + "df =df.sort_values(by='count', ascending=False)\n", + "\n", + "\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# 停止spark上下文\n", + "sc.stop()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/jupyter/Spark上手示例2:DataFrame操作.ipynb b/docs/jupyter/Spark上手示例2:DataFrame操作.ipynb new file mode 100644 index 0000000..8fd8250 --- /dev/null +++ b/docs/jupyter/Spark上手示例2:DataFrame操作.ipynb @@ -0,0 +1,1028 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 导入相关库\n", + "from pyspark.sql import Row, SparkSession,SQLContext\n", + "from pyspark.sql.types import IntegerType,DateType, TimestampType\n", + "from pyspark.sql.functions import col, udf,to_date,from_unixtime,countDistinct\n", + "\n", + "# 计算处理\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import time\n", + "\n", + "# 图表相关\n", + "import plotly.plotly as py\n", + "import plotly\n", + "plotly.offline.init_notebook_mode(connected=True)\n", + "import plotly.graph_objs as go\n", + 
"\n", + "# jupyter使用matplot的配置\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 创建spark上下文,并设置10个分区\n", + "spark = SparkSession.builder.appName(\"vas项目\").config(\"spark.default.parallelism\", 10).getOrCreate()\n", + "sc = spark.sparkContext" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", + "Wall time: 41.3 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "logPaths = ['/var/log/vas-project/vas_data/201807',\n", + " '/var/log/vas-project/vas_data/201808', \n", + " '/var/log/vas-project/vas_data/201809',\n", + " '/var/log/vas-project/vas_data/201810'\n", + " ];\n", + "df = spark.read.format('json').load(logPaths)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------------+------+--------------------+---------------+-----+-----------+---+\n", + "|brand|country_code|device| events| ip_address|model| partner|ref|\n", + "+-----+------------+------+--------------------+---------------+-----+-----------+---+\n", + "| Itel| ML| sp|[[click, 15358391...| 217.64.103.74| P13|searchturbo| m|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|\n", + "| Itel| NG| sp|[[click, 15358391...| 197.210.226.58| P32|searchturbo| m|\n", + "| Itel| IN| sp|[[click, 15358391...| 157.48.123.237| A22|searchturbo| m|\n", + "| Itel| EG| sp|[[click, 15358391...| 105.199.93.33| A32F|searchturbo| m|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|\n", + "| Itel| MA| sp|[[click, 15358391...| 41.249.147.213| A32F|searchturbo| m|\n", + "| Itel| CI| sp|[[click, 15358391...| 154.0.26.115| P32| Unknown| m|\n", + "| Itel| BJ| sp|[[click, 15358391...|197.234.221.243| 
A32F|searchturbo| m|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|\n", + "+-----+------------+------+--------------------+---------------+-----+-----------+---+\n", + "\n", + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 11.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# 查看前10条数据\n", + "\n", + "df.limit(10).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 6.31 s\n" + ] + }, + { + "data": { + "text/plain": [ + "2075513" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# 查看总共记录数\n", + "\n", + "df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 例1. 按品牌机型统计" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+---------+------+\n", + "|brand| model| count|\n", + "+-----+---------+------+\n", + "| Itel| A52B| 19|\n", + "| Itel| A14| 10136|\n", + "| Itel| S13 Pro| 151|\n", + "| Itel| A16 Plus| 229|\n", + "| Itel| A52| 12812|\n", + "| Itel| A45| 68690|\n", + "| Itel| A22| 69811|\n", + "| Itel| A16| 4210|\n", + "| Itel| S11X| 27366|\n", + "| Itel| A62| 11161|\n", + "|Spice| Z213| 77393|\n", + "| Itel| S11XB| 137|\n", + "| Itel| A15| 11744|\n", + "| Itel| P32|550753|\n", + "| Itel| P13 Plus| 176|\n", + "| Itel|A44 Power| 32|\n", + "| Itel| A32F|537792|\n", + "| Itel|itel A32F| 67001|\n", + "| Itel| A23| 1493|\n", + "| Itel| S13| 19634|\n", + "+-----+---------+------+\n", + "only showing top 20 rows\n", + "\n", + "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", + "Wall time: 10.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# 按照品牌(brand)和机型(model)进行聚合\n", + "\n", + "brand_model_count = 
df.select('brand', 'partner', 'model').groupBy('brand','model').count()\n", + "\n", + "# 打印一下\n", + "brand_model_count.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4 ms, sys: 8 ms, total: 12 ms\n", + "Wall time: 10.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# 转换成pandas\n", + "\n", + "pd_df = brand_model_count.toPandas()\n", + "\n", + "# 查看前5条\n", + "pd_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
brandmodelcount
Itel_S13ProItelS13Pro3
Itel_A52BItelA52B19
Itel_A44 PowerItelA44 Power32
Itel_S11XBItelS11XB137
Itel_S13 ProItelS13 Pro151
\n", + "
" + ], + "text/plain": [ + " brand model count\n", + "Itel_S13Pro Itel S13Pro 3\n", + "Itel_A52B Itel A52B 19\n", + "Itel_A44 Power Itel A44 Power 32\n", + "Itel_S11XB Itel S11XB 137\n", + "Itel_S13 Pro Itel S13 Pro 151" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 数据转换,排序处理下\n", + "\n", + "pd_df.index = pd_df['brand'] + '_' + pd_df['model']\n", + "pd_df = pd_df.sort_values(by = ['brand', 'count'])\n", + "\n", + "pd_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
Itel_S13Pro3
Itel_A52B19
Itel_A44 Power32
Itel_S11XB137
Itel_S13 Pro151
\n", + "
" + ], + "text/plain": [ + " count\n", + "Itel_S13Pro 3\n", + "Itel_A52B 19\n", + "Itel_A44 Power 32\n", + "Itel_S11XB 137\n", + "Itel_S13 Pro 151" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 只取count列\n", + "pd_df = pd_df[['count']]\n", + "\n", + "# 查看一下\n", + "pd_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 图表显示下\n", + "\n", + "pd_df.plot(kind='bar', figsize=(15, 5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 例2. 按国家查看访问量" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", + "Wall time: 62.6 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "# 按照country_code 进行聚合\n", + "\n", + "country_code_count = df.select('country_code').groupBy('country_code').count()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 7.5 s\n" + ] + }, + { + "data": { + "text/plain": [ + "[Row(country_code='DZ', count=1027),\n", + " Row(country_code='LT', count=12),\n", + " Row(country_code='MM', count=18),\n", + " Row(country_code='CI', count=95814),\n", + " Row(country_code='SC', count=8)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "# 显示前5条数据\n", + "\n", + "country_code_count.limit(5).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_codecount
0DZ1027
1LT12
2MM18
3CI95814
4SC8
\n", + "
" + ], + "text/plain": [ + " country_code count\n", + "0 DZ 1027\n", + "1 LT 12\n", + "2 MM 18\n", + "3 CI 95814\n", + "4 SC 8" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 转换成pandas\n", + "\n", + "codePandas = country_code_count.toPandas()\n", + "\n", + "codePandas.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_codecount
74IN440134
139NG432983
58Unknown309625
39BD192363
146VN96927
3CI95814
130MA69185
33GH49175
77CM44962
115SN42320
\n", + "
" + ], + "text/plain": [ + " country_code count\n", + "74 IN 440134\n", + "139 NG 432983\n", + "58 Unknown 309625\n", + "39 BD 192363\n", + "146 VN 96927\n", + "3 CI 95814\n", + "130 MA 69185\n", + "33 GH 49175\n", + "77 CM 44962\n", + "115 SN 42320" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 按照访问量排序下\n", + "\n", + "codePandas = codePandas.sort_values(by='count', ascending=False)\n", + "\n", + "codePandas.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "550385a639cf43a1824e9ab6bcac42d3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value=''.format(src))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 例3. 每小时访问量走势图" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 387 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# 增加3个栏位, timestamp, hour, month\n", + "\n", + "df = df.withColumn('timestamp', df.events[0].timestamp)\n", + "df = df.withColumn('hour', from_unixtime(df.events[0].timestamp, 'HH'))\n", + "df = df.withColumn('month', from_unixtime(df.events[0].timestamp, 'yyyy-MM'))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------------+------+--------------------+---------------+-----+-----------+---+----------+----+-------+\n", + "|brand|country_code|device| events| ip_address|model| partner|ref| timestamp|hour| month|\n", + "+-----+------------+------+--------------------+---------------+-----+-----------+---+----------+----+-------+\n", + "| Itel| ML| sp|[[click, 15358391...| 217.64.103.74| 
P13|searchturbo| m|1535839148| 05|2018-09|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|1535839156| 05|2018-09|\n", + "| Itel| NG| sp|[[click, 15358391...| 197.210.226.58| P32|searchturbo| m|1535839161| 05|2018-09|\n", + "| Itel| IN| sp|[[click, 15358391...| 157.48.123.237| A22|searchturbo| m|1535839162| 05|2018-09|\n", + "| Itel| EG| sp|[[click, 15358391...| 105.199.93.33| A32F|searchturbo| m|1535839163| 05|2018-09|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|1535839164| 05|2018-09|\n", + "| Itel| MA| sp|[[click, 15358391...| 41.249.147.213| A32F|searchturbo| m|1535839167| 05|2018-09|\n", + "| Itel| CI| sp|[[click, 15358391...| 154.0.26.115| P32| Unknown| m|1535839174| 05|2018-09|\n", + "| Itel| BJ| sp|[[click, 15358391...|197.234.221.243| A32F|searchturbo| m|1535839174| 05|2018-09|\n", + "| Itel| EG| sp|[[click, 15358391...|196.141.135.133| A32F|searchturbo| m|1535839175| 05|2018-09|\n", + "+-----+------------+------+--------------------+---------------+-----+-----------+---+----------+----+-------+\n", + "only showing top 10 rows\n", + "\n", + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 370 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# 按照小时聚合下\n", + "\n", + "group_by_hour = df.select('hour').groupBy('hour').count()\n", + "df.show(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8 ms, sys: 4 ms, total: 12 ms\n", + "Wall time: 10.4 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# dataframe 转换成pandas\n", + "group_by_hour_pandas_df = group_by_hour.toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hourcount
hour
00118156
11112722
2285696
3385365
4477598
\n", + "
" + ], + "text/plain": [ + " hour count\n", + "hour \n", + "0 0 118156\n", + "1 1 112722\n", + "2 2 85696\n", + "3 3 85365\n", + "4 4 77598" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 按照小时进行排序下\n", + "group_by_hour_pandas_df = group_by_hour_pandas_df.sort_values(by='hour')\n", + "\n", + "# 强制转换整数类型\n", + "group_by_hour_pandas_df.hour = group_by_hour_pandas_df['hour'].map(int)\n", + "\n", + "# 将小时设置pandas索引\n", + "group_by_hour_pandas_df.index = group_by_hour_pandas_df.hour\n", + "\n", + "group_by_hour_pandas_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 画个图表\n", + "hourcount = (group_by_hour_pandas_df.to_dict())['count']\n", + "index = list(range(0,24))\n", + "cols = []\n", + "for i in index:\n", + " if i not in hourcount:\n", + " cols.append(0)\n", + " else:\n", + " cols.append(hourcount[i])\n", + "\n", + "group_by_hour_pandas_df = pd.DataFrame({'num': cols})\n", + "\n", + "\n", + "group_by_hour_pandas_df.plot(title='vas project-access count by hour', kind='line', figsize=(15, 5), xticks=group_by_hour_pandas_df.index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 例4. 每个月访问量" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 116 ms, sys: 0 ns, total: 116 ms\n", + "Wall time: 10.3 s\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%%time\n", + "group_by_month =df.select('month').groupBy('month').count()\n", + "\n", + "group_by_month_pandas = group_by_month.limit(5).toPandas()\n", + "group_by_month_pandas.index = group_by_month_pandas['month']\n", + "group_by_month_pandas.index.name = 'm';\n", + "group_by_month_pandas = group_by_month_pandas.sort_values(by='month', ascending=True)\n", + "\n", + "## 每月uv\n", + "group_by_month_pandas[['count']].plot(kind='bar', figsize=(15, 5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/language/Go.md b/docs/language/Go.md index c1031a0..7d7602e 100644 --- a/docs/language/Go.md +++ b/docs/language/Go.md @@ -256,6 +256,75 @@ channel收发遵循FIFO原则,其底层是hchan结构指针,创建通道使 - slice - channel +### 为啥 channel 会有 close 这个操作, 在哪些场景下会用到这个操作 ? + +在 Go 语言中,channel 的 close 操作用于向 channel 的接收方明确地通知发送操作已经完成。关闭一个 channel 可以表达“没有更多的数据将被发送到这个 channel”这一信号。这是一种控制信号,帮助接收方理解数据流的生命周期,并且可以避免在 channel 上进行无限等待。 + +#### 使用 close 的场景 + +1. **通知多个接收者完成处理**: + + 当使用一个 channel 来分发任务或数据给多个协程(goroutines)时,关闭 channel 是一种告知所有接收者没有更多数据要处理的有效方法。接收者可以通过检测 channel 是否已关闭来适时停止处理。 + +2. **控制循环退出**: + + 在接收数据时,可以使用 for range 循环从 channel 接收数据。当 channel 被关闭,并且 channel 中已经没有待处理的数据时,for range 循环会自动结束。这使得编码简洁,并且逻辑清晰。 + +3. **防止资源泄露**: + + 如果不关闭不再使用的 channel,可能会导致内存资源没有得到释放,特别是在 channel 还保持着一些数据项的情况下。尽管 Go 的垃圾回收机制会回收未引用的对象,但显式关闭 channel 是一个好的实践,它可以清晰地表达程序设计者的意图。 + +4. 
**在 select 中作为退出信号**: + + 在使用 select 语句处理多个 channel 的时候,对已关闭 channel 的接收操作会立即返回零值,因此监听该 channel 的 case 会立刻就绪。在需要优雅退出的并发模式中,常见做法是关闭一个专用的 done channel,让所有在 select 中监听它的协程同时收到退出信号并执行结束逻辑。 + +#### 示例:数据处理和广播信号 + +假设有一个数据处理任务,需要将数据分批发送到多个处理协程,处理完成后再汇总结果。这里可以使用关闭 channel 的方式来告知所有处理协程,数据已经发送完毕: + +```go +func processData(dataChunks [][]int) []int { + var results []int + resultChan := make(chan int) + dataChan := make(chan int, 100) + + // 启动多个工作协程 + for i := 0; i < 5; i++ { + go func() { + for data := range dataChan { + result := process(data) // 假设有一个处理函数 + resultChan <- result + } + }() + } + + // 发送数据 + go func() { + for _, chunk := range dataChunks { + for _, data := range chunk { + dataChan <- data + } + } + close(dataChan) + }() + + // 接收结果 + go func() { + for i := 0; i < len(dataChunks); i++ { + result := <-resultChan + results = append(results, result) + } + close(resultChan) + }() + + return results +} +``` + +在这个示例中,通过关闭 dataChan 来告知工作协程不会再有新的数据发送,工作协程的 for range 循环会在取完剩余数据后自动结束。需要注意,上面的代码仅用于示意 close 的用法:processData 在启动协程后立即 return,并不会等待结果收集完成;接收循环的次数也应当是数据项总数而不是 len(dataChunks)。实际使用时,应使用 sync.WaitGroup 等待所有工作协程结束后再由发送侧关闭 resultChan,并在调用方协程中用 for range resultChan 收集完结果后再返回。 + +总结来说,关闭一个 channel 是一种向接收方传递完成信号的方法,它在多协程协作的环境中尤为有用,有助于提高代码的可读性和安全性。 + ## Go如何避免内存的对象频繁分配和回收的问题? 可以考虑使用对象缓存池sync.Pool @@ -877,6 +946,101 @@ func As(err error, target interface{}) bool // 判断err是否为target类型 [一篇文章带你轻松搞懂Golang的error处理_Golang_脚本之家](https://www.jb51.net/article/254917.htm) +## Go错误处理机制为啥不采用Java的try-catch的异常机制?
+ +Go 语言选择不使用 Java 中的异常机制,而是采用返回错误(error)的方式处理异常情况,这主要是基于几个设计目标和考虑: + +- **简明性和可预测性**:在 Go 语言中,错误被视为常见的、可预料的结果,而不是异常情况。通过显式地返回错误,程序员被迫处理错误,这可以使得错误处理更加显式和清晰,减少了忽视错误处理的可能性。 + +- **控制流简单化**:使用异常机制时,代码的执行流可以因为异常的抛出而在任何点被中断,这使得程序的控制流变得复杂和难以跟踪。相反,通过返回错误,错误处理和正常逻辑清晰地分离,控制流更加直观。 + +- **性能考虑**:异常处理机制通常涉及栈展开(stack unwinding)过程,这在性能上可能带来开销。Go 语言通过简单的值返回避免了这种开销。 + +- **跨语言调用简化**:Go 语言在设计时考虑到了与 C 语言的互操作性。在 C 语言中没有异常机制,使用错误码是常见做法。Go 的错误处理方式与之兼容,简化了跨语言调用的复杂性。 + +关于是否是一个好的设计,这取决于具体的使用场景和个人偏好。这种设计强化了错误处理的重要性,并鼓励开发者更加关注程序可能出现的错误情况,从而可能增强程序的健壮性。但是,这也可能导致代码中充满大量的错误处理逻辑,有时可能显得冗余。 + +Java 的异常机制有其优点,特别是在错误需要跨多个调用层级传播时,异常提供了一种非常便捷的方式。然而,它也有一些常见的问题: + +- **异常被忽略**:Java 中的异常可以被捕获后不做任何处理,这可能隐藏程序中的错误。 +- **代码复杂性**:异常的抛出和捕获可能使得程序逻辑变得难以理解和维护,特别是当异常可以在多个层级被抛出和捕获时。 +- **性能问题**:异常处理可能引入性能损耗,特别是在异常频繁抛出的场景中。 + +总体来说,Go 的错误处理方式倾向于更加显式和可控,而 Java 的异常机制则提供了一种灵活但有时可能不够清晰的错误处理策略。选择哪种方式更好,很大程度上取决于特定项目的需求和开发团队的偏好。 + +## Go为什么采用组合机制而不是使用继承机制? + +Go 语言没有传统意义上的类和继承机制,这是其设计哲学中的一部分,旨在简化语言和避免常见的面向对象编程的一些复杂性和问题。下面是 Go 语言这样设计的一些原因及其替代方式: + +### 为什么 Go 没有传统的继承? + +- **简化语言设计**:Go 的设计哲学是保持语言的简洁和高效。继承是一个强大但复杂的功能,可以导致多种编程问题,如复杂的依赖关系和难以预测的行为。 + +- **避免继承带来的问题**: + - **脆弱的基类问题**:基类的改变可能影响到大量的派生类。 + - **深层继承结构导致的复杂性**:随着继承链的增长,理解和维护代码变得更加困难。 + - **多重继承的复杂性**:如 C++ 中的多重继承可能导致菱形继承问题,增加了语言和编译器的复杂性。 + +### Go 如何实现多态? + +尽管 Go 没有继承,它通过接口来支持多态性。在 Go 中,接口是一组方法签名的集合,任何类型只要实现了这些方法,就被认为实现了该接口。这种方式与继承不同,更加灵活和简洁: + +- **接口隐式实现**:类型不需要声明它实现了哪个接口,这降低了代码之间的耦合。 +- **组合优于继承**:Go 通过组合(有时候通过嵌入结构体)来实现代码的复用,这比继承更加直接和清晰。 + +### Embedded Struct 算不算继承? 
+ +Embedded struct(嵌入结构体)在 Go 中被用作实现类似继承的功能,但它更准确地被描述为组合。通过嵌入一个结构体,一个新的结构体可以直接访问嵌入结构体的方法和字段,这提供了一种方式来复用代码: + +- **不是真正的继承**:虽然看起来类似,嵌入结构体并不提供传统意义上的多态。 +- **代码复用和扩展**:它允许一种灵活的方式来扩展功能,而无需继承的复杂性。 + +### 传统继承的问题 + +- **过度耦合**:子类和父类之间的关系过于紧密,改动父类可能会影响所有子类。 +- **隐藏的复杂性**:继承可以导致代码的行为不透明,增加理解和调试的难度。 +- **难以正确使用**:正确地设计和维护一个继承体系需要大量的设计经验和技术洞察力。 + +Go 的设计选择鼓励开发者采用更简单、更易于理解和维护的编程范式。通过接口和组合,Go 提供了一种强大的工具集来建构灵活且可维护的代码结构,避免了许多传统面向对象编程中常见的陷阱。 + +## Go 中 channel 跟 Java 中 BlockingQueue 又有啥区别 ?
+ +Go 的 channel 和 Java 的 BlockingQueue 都是用于不同线程或协程间的通信机制,但它们的设计哲学和使用场景有所不同。这两种机制都用于解决并发编程中的同步问题,但具体的实现和适用的场景有差异。 + +### Channel 与 BlockingQueue 的区别 + +1. 设计哲学: + - Go 的 Channel:Channel 是 Go 语言中的一等公民,用于在协程(goroutines)之间进行通信。它遵循“通过通信来共享内存,而不是通过共享内存来通信”的哲学。 + - Java 的 BlockingQueue:是 Java 并发包中的一部分,主要用于线程间的通信,尤其在生产者-消费者模型中。它依赖于共享内存和锁来实现线程安全。 + +2. 功能实现: + - Channel 支持多种模式,如无缓冲、有缓冲通道,可以非常灵活地控制协程间的数据流和同步。 + - BlockingQueue 是一个接口,Java 提供了多种实现(如 ArrayBlockingQueue, LinkedBlockingQueue),主要通过阻塞操作来实现生产者和消费者之间的同步。 + +3. 用途和应用场景: + - Channel 通常用于协程间的信号传递和数据交换,特别是在需要控制并发操作顺序时。 + - BlockingQueue 通常用于处理较大的数据流或者在多线程环境下缓存数据。 + +### 共享内存并发 vs. Channel 并发 +#### 共享内存并发 + +- 适用场景:适合复杂的数据结构共享,或者当有多个线程需要访问和修改同一数据时。在多核处理器上,这种方式可以有效利用缓存一致性协议。 +- 优点:可以实现细粒度的控制,对于某些高性能计算场景可以更直接地管理内存。 +- 缺点:容易产生竞态条件,编程模型更加复杂,需要精确地控制锁和同步。 + +#### Channel 并发 + +- 适用场景:适合事件驱动或消息驱动的应用,如网络服务或并行数据处理。在这些场景中,通信模式清晰,各部分之间的解耦更彻底。 +- 优点:简化了并发和同步的管理,代码通常更易于理解和维护。 +- 缺点:在极端的高性能需求下,可能会因为消息传递的开销而不如直接的内存访问高效。 + +#### 选择建议 + +- 如果问题适合通过明确的消息传递进行模块化设计,或者当系统的可维护性和清晰的并发模型比原始性能更重要时,使用 Channel。 +- 如果需要最大限度地控制性能,并且可以管理更复杂的同步策略和竞态风险,使用共享内存可能更合适。 + +在实际开发中,选择合适的并发策略依赖于具体问题、性能需求和团队的熟悉度。对于维护性和开发效率有较高要求的项目,Channel 往往是一个更易于管理的选择。 + ## 资料 [【Golang开发面经】蔚来(两轮技术面)](https://zhuanlan.zhihu.com/p/574580955) \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 696980f..34c9994 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -36,8 +36,14 @@ nav: - 镜像: container/image.md - cgroup: container/cgroup.md - namespace: container/namespace.md - - 语言: + - 开发语言: - Go: language/Go.md + - Jupyter: + - Go-Frameworks-Github-Fork-Stats: jupyter/Go-Frameworks-Github-Fork-Stats.ipynb + - Pandas完全指南: jupyter/Pandas完全指南.ipynb + - Spark上手示例: + - jupyter/Spark上手示例1:RDD操作.ipynb + - jupyter/Spark上手示例2:DataFrame操作.ipynb - QA: - redis: qa/redis.md - mysql: qa/mysql.md @@ -48,7 +54,7 @@ nav: - 消息队列: qa/queue.md - IO: qa/io.md - protobuf: qa/protobuf.md - - go: qa/go.md + - go: qa/go.md - 分布式: qa/dist.md - Elasticsearch: qa/es.md - docker: qa/docker.md @@ 
-118,7 +124,12 @@ theme: plugins: - search: - separator: '[\s\u200b\-]' + separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' + - git-revision-date-localized: + type: iso_date + - glightbox + - mkdocs-jupyter + - print-site # Additional configuration extra: