{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Operating on Dask Dataframes with SQL\n",
    "\n",
    "[Dask-SQL](https://dask-sql.readthedocs.io/en/stable/) is an open source project and Python package leveraging [Apache Calcite](https://calcite.apache.org/) to provide a SQL frontend for [Dask](https://dask.org/) dataframe operations, allowing SQL users to take advantage of Dask's distributed capabilities without requiring an extensive knowledge of the dataframe API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:12.525493Z",
     "iopub.status.busy": "2022-07-27T19:13:12.524706Z",
     "iopub.status.idle": "2022-07-27T19:13:17.625457Z",
     "shell.execute_reply": "2022-07-27T19:13:17.624479Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting dask-sql\r\n",
      "  Downloading dask_sql-2022.6.0-py3-none-any.whl (21.1 MB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/21.1 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[91m━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/21.1 MB\u001b[0m \u001b[31m136.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/21.1 MB\u001b[0m \u001b[31m173.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r",
      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━\u001b[0m \u001b[32m18.0/21.1 MB\u001b[0m \u001b[31m189.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m179.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m179.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m71.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25h"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting uvicorn>=0.11.3\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  Downloading uvicorn-0.18.2-py3-none-any.whl (57 kB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/57.0 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.0/57.0 KB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25hRequirement already satisfied: tabulate in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (0.8.9)\r\n",
      "Requirement already satisfied: nest-asyncio in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (1.5.5)\r\n",
      "Collecting tzlocal>=2.1\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  Downloading tzlocal-4.2-py3-none-any.whl (19 kB)\r\n",
      "Requirement already satisfied: pandas>=1.0.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (1.4.2)\r\n",
      "Requirement already satisfied: pygments in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (2.12.0)\r\n",
      "Requirement already satisfied: prompt-toolkit in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (3.0.29)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting jpype1>=1.0.2\r\n",
      "  Downloading JPype1-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/453.8 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m453.8/453.8 KB\u001b[0m \u001b[31m69.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25hRequirement already satisfied: dask[dataframe,distributed]<=2022.5.2,>=2022.3.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask-sql) (2022.5.0)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting fastapi>=0.61.1\r\n",
      "  Downloading fastapi-0.79.0-py3-none-any.whl (54 kB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/54.6 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 KB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (21.3)\r\n",
      "Requirement already satisfied: fsspec>=0.6.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2022.3.0)\r\n",
      "Requirement already satisfied: toolz>=0.8.2 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (0.11.2)\r\n",
      "Requirement already satisfied: cloudpickle>=1.1.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2.0.0)\r\n",
      "Requirement already satisfied: pyyaml>=5.3.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (6.0)\r\n",
      "Requirement already satisfied: partd>=0.3.10 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.2.0)\r\n",
      "Requirement already satisfied: numpy>=1.18 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.22.3)\r\n",
      "Requirement already satisfied: distributed==2022.05.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2022.5.0)\r\n",
      "Requirement already satisfied: jinja2 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (3.1.1)\r\n",
      "Requirement already satisfied: urllib3 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.26.9)\r\n",
      "Requirement already satisfied: tblib>=1.6.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.7.0)\r\n",
      "Requirement already satisfied: tornado>=6.0.3 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (6.1)\r\n",
      "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2.4.0)\r\n",
      "Requirement already satisfied: msgpack>=0.6.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.0.3)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: locket>=1.0.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.0.0)\r\n",
      "Requirement already satisfied: zict>=0.1.3 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2.2.0)\r\n",
      "Requirement already satisfied: click>=6.6 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (8.1.3)\r\n",
      "Requirement already satisfied: psutil>=5.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (5.9.0)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from fastapi>=0.61.1->dask-sql) (1.9.1)\r\n",
      "Collecting starlette==0.19.1\r\n",
      "  Downloading starlette-0.19.1-py3-none-any.whl (63 kB)\r\n",
      "\u001b[?25l"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/63.3 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.3/63.3 KB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25hRequirement already satisfied: anyio<5,>=3.4.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from starlette==0.19.1->fastapi>=0.61.1->dask-sql) (3.5.0)\r\n",
      "Requirement already satisfied: typing-extensions>=3.10.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from starlette==0.19.1->fastapi>=0.61.1->dask-sql) (4.2.0)\r\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from pandas>=1.0.0->dask-sql) (2.8.2)\r\n",
      "Requirement already satisfied: pytz>=2020.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from pandas>=1.0.0->dask-sql) (2022.1)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pytz-deprecation-shim\r\n",
      "  Downloading pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl (15 kB)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting h11>=0.8\r\n",
      "  Downloading h11-0.13.0-py3-none-any.whl (58 kB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/58.2 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.2/58.2 KB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25h"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: wcwidth in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from prompt-toolkit->dask-sql) (0.2.5)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from packaging>=20.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (3.0.8)\r\n",
      "Requirement already satisfied: six>=1.5 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=1.0.0->dask-sql) (1.16.0)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting tzdata\r\n",
      "  Downloading tzdata-2022.1-py2.py3-none-any.whl (339 kB)\r\n",
      "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/339.5 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m339.5/339.5 KB\u001b[0m \u001b[31m61.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25h"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sniffio>=1.1 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette==0.19.1->fastapi>=0.61.1->dask-sql) (1.2.0)\r\n",
      "Requirement already satisfied: idna>=2.8 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette==0.19.1->fastapi>=0.61.1->dask-sql) (3.3)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: heapdict in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from zict>=0.1.3->distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (1.0.1)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages (from jinja2->distributed==2022.05.0->dask[dataframe,distributed]<=2022.5.2,>=2022.3.0->dask-sql) (2.1.1)\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Installing collected packages: tzdata, jpype1, h11, uvicorn, starlette, pytz-deprecation-shim, tzlocal, fastapi, dask-sql\r\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully installed dask-sql-2022.6.0 fastapi-0.79.0 h11-0.13.0 jpype1-1.4.0 pytz-deprecation-shim-0.1.0.post0 starlette-0.19.1 tzdata-2022.1 tzlocal-4.2 uvicorn-0.18.2\r\n"
     ]
    }
   ],
   "source": [
    "! pip install dask-sql"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up a Dask cluster\n",
    "\n",
    "Setting up a Dask [Cluster](https://docs.dask.org/en/latest/deploying.html) is optional, but can dramatically expand our options for distributed computation by giving us access to Dask workers on GPUs, remote machines, common cloud providers, and more.\n",
    "Additionally, connecting our cluster to a Dask [Client](https://distributed.dask.org/en/stable/client.html) will give us access to a dashboard, which can be used to monitor the progress of active computations and diagnose issues.\n",
    "\n",
    "For this notebook, we will create a local cluster and connect it to a client.\n",
    "Once the client has been created, a link will appear to its associated dashboard, which can be viewed throughout the following computations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:17.629522Z",
     "iopub.status.busy": "2022-07-27T19:13:17.629297Z",
     "iopub.status.idle": "2022-07-27T19:13:19.786783Z",
     "shell.execute_reply": "2022-07-27T19:13:19.785970Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-2c014484-0de0-11ed-9c67-000d3a8f7959</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "\n",
       "        <tr>\n",
       "        \n",
       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
       "        \n",
       "        </tr>\n",
       "\n",
       "        \n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\"></td>\n",
       "            </tr>\n",
       "        \n",
       "\n",
       "        </table>\n",
       "\n",
       "        \n",
       "            <details>\n",
       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
       "    </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">dec5a19f</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Workers:</strong> 2\n",
       "                </td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total threads:</strong> 4\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total memory:</strong> 1.86 GiB\n",
       "                </td>\n",
       "            </tr>\n",
       "            \n",
       "            <tr>\n",
       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
       "</tr>\n",
       "\n",
       "            \n",
       "        </table>\n",
       "\n",
       "        <details>\n",
       "            <summary style=\"margin-bottom: 20px;\">\n",
       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
       "            </summary>\n",
       "\n",
       "            <div style=\"\">\n",
       "    <div>\n",
       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
       "        <div style=\"margin-left: 48px;\">\n",
       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-93f0bc5c-2d81-4dc1-b2c8-72e6c702a5b9</p>\n",
       "            <table style=\"width: 100%; text-align: left;\">\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Comm:</strong> tcp://127.0.0.1:38331\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Workers:</strong> 2\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total threads:</strong> 4\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Started:</strong> Just now\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total memory:</strong> 1.86 GiB\n",
       "                    </td>\n",
       "                </tr>\n",
       "            </table>\n",
       "        </div>\n",
       "    </div>\n",
       "\n",
       "    <details style=\"margin-left: 48px;\">\n",
       "        <summary style=\"margin-bottom: 20px;\">\n",
       "            <h3 style=\"display: inline;\">Workers</h3>\n",
       "        </summary>\n",
       "\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:45347\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 2\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37117/status\" target=\"_blank\">http://127.0.0.1:37117/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 0.93 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:43225\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /home/runner/work/dask-examples/dask-examples/dask-worker-space/worker-pd8kj694\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:36979\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 2\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:45797/status\" target=\"_blank\">http://127.0.0.1:45797/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 0.93 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:41733\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /home/runner/work/dask-examples/dask-examples/dask-worker-space/worker-e6d8x_if\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "\n",
       "    </details>\n",
       "</div>\n",
       "\n",
       "        </details>\n",
       "    </div>\n",
       "</div>\n",
       "            </details>\n",
       "        \n",
       "\n",
       "    </div>\n",
       "</div>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:38331' processes=2 threads=4, memory=1.86 GiB>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dask.distributed import Client\n",
    "\n",
    "client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a context\n",
    "\n",
    "A `dask_sql.Context` is the Python equivalent of a SQL database, serving as an interface for registering all tables and functions used in SQL queries, as well as for executing the queries themselves.\n",
    "In typical usage, a single `Context` is created and used for the duration of a Python script or notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:19.790696Z",
     "iopub.status.busy": "2022-07-27T19:13:19.790167Z",
     "iopub.status.idle": "2022-07-27T19:13:21.935652Z",
     "shell.execute_reply": "2022-07-27T19:13:21.934827Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages/dask_sql/java.py:39: UserWarning: You are running in a conda environment, but the JAVA_PATH is not using it. If this is by mistake, set $JAVA_HOME to /usr/share/miniconda3/envs/dask-examples, instead of /usr/lib/jvm/temurin-11-jdk-amd64.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from dask_sql import Context\n",
    "c = Context()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and register data\n",
    "\n",
    "Once a `Context` has been created, there are a variety of ways to register tables in it.\n",
    "The simplest way to do this is through the `create_table` method, which accepts a variety of input types that Dask-SQL then uses to infer the table creation method.\n",
    "Supported input types include:\n",
    "\n",
    "- Dask / [Pandas](https://pandas.pydata.org/)-like dataframes\n",
    "- String locations of local or remote datasets\n",
    "- [Apache Hive](https://github.com/apache/hive) tables served through [PyHive](https://github.com/dropbox/PyHive) or [SQLAlchemy](https://www.sqlalchemy.org/)\n",
    "\n",
    "The input type can also be specified explicitly by passing a `format` argument.\n",
    "When being registered, tables can optionally be persisted into memory by passing `persist=True`, which can greatly speed up repeated queries on the same table at the cost of loading the entire table into memory.\n",
    "For more information, see [Data Loading and Input](https://dask-sql.readthedocs.io/en/latest/pages/data_input.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:21.939531Z",
     "iopub.status.busy": "2022-07-27T19:13:21.939086Z",
     "iopub.status.idle": "2022-07-27T19:13:22.170735Z",
     "shell.execute_reply": "2022-07-27T19:13:22.170137Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from dask.datasets import timeseries\n",
    "\n",
    "# register and persist a dask table\n",
    "ddf = timeseries()\n",
    "c.create_table(\"dask\", ddf, persist=True)\n",
    "\n",
    "# register a pandas table (implicitly converted to a dask table)\n",
    "df = pd.DataFrame({\"a\": [1, 2, 3]})\n",
    "c.create_table(\"pandas\", df)\n",
    "\n",
    "# register a table from local storage; kwargs are passed on to the underlying table creation method\n",
    "c.create_table(\n",
    "    \"local\",\n",
    "    \"surveys/data/2021-user-survey-results.csv.gz\",\n",
    "    format=\"csv\",\n",
    "    parse_dates=['Timestamp'],\n",
    "    blocksize=None\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tables can also be registered through SQL `CREATE TABLE WITH` or `CREATE TABLE AS` statements, using the `sql` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:22.177427Z",
     "iopub.status.busy": "2022-07-27T19:13:22.174337Z",
     "iopub.status.idle": "2022-07-27T19:13:25.435787Z",
     "shell.execute_reply": "2022-07-27T19:13:25.435248Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/share/miniconda3/envs/dask-examples/lib/python3.9/site-packages/dask/dataframe/io/csv.py:533: UserWarning: Warning gzip compression does not support breaking apart files\n",
      "Please ensure that each individual file can fit in memory and\n",
      "use the keyword ``blocksize=None to remove this message``\n",
      "Setting ``blocksize=None``\n",
      "  warn(\n"
     ]
    }
   ],
   "source": [
    "# replace our table from local storage\n",
    "c.sql(\"\"\"\n",
    "    CREATE OR REPLACE TABLE\n",
    "        \"local\"\n",
    "    WITH (\n",
    "        location = 'surveys/data/2021-user-survey-results.csv.gz',\n",
    "        format = 'csv',\n",
    "        parse_dates = ARRAY [ 'Timestamp' ]\n",
    "    )\n",
    "\"\"\")\n",
    "\n",
    "# create a new table from a SQL query\n",
    "c.sql(\"\"\"\n",
    "    CREATE TABLE filtered AS (\n",
    "        SELECT id, name FROM dask WHERE name = 'Zelda'\n",
    "    )\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All registered tables can be listed with a `SHOW TABLES` statement."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:25.440811Z",
     "iopub.status.busy": "2022-07-27T19:13:25.439548Z",
     "iopub.status.idle": "2022-07-27T19:13:25.564442Z",
     "shell.execute_reply": "2022-07-27T19:13:25.563864Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Table</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>dask</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>pandas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>local</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>filtered</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Table\n",
       "0      dask\n",
       "1    pandas\n",
       "2     local\n",
       "3  filtered"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"SHOW TABLES FROM root\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dask-SQL currently offers experimental GPU support, powered by the [RAPIDS](https://rapids.ai/) suite of open source GPU data science libraries.\n",
    "Input support is currently limited to Dask / Pandas-like dataframes and data in local/remote storage, and though most queries run without issue, users should expect some bugs or undefined behavior.\n",
    "To register a table and mark it for use on GPUs, `gpu=True` can be passed to a standard `create_table` call, or its equivalent `CREATE TABLE WITH` query (note that this requires [cuDF and Dask-cuDF](https://github.com/rapidsai/cudf)).\n",
    "\n",
    "```python\n",
    "# register a dask table for use on GPUs (not possible in this binder)\n",
    "c.create_table(\"gpu_dask\", ddf, gpu=True)\n",
    "\n",
    "# load in a table from disk using GPU-accelerated IO operations\n",
    "c.sql(\"\"\"\n",
    "    CREATE TABLE\n",
    "        \"gpu_local\"\n",
    "    WITH (\n",
    "        location = 'surveys/data/2021-user-survey-results.csv.gz',\n",
    "        format = 'csv',\n",
    "        parse_dates = ARRAY [ 'Timestamp' ],\n",
    "        gpu = True\n",
    "    )\n",
    "\"\"\")\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query the data\n",
    "\n",
    "When the `sql` method is called, Dask-SQL hands the query off to Apache Calcite to convert into a relational algebra - essentially a list of SQL tasks that must be executed in order to get a result.\n",
    "The relational algebra of any query can be viewed directly using the `explain` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:25.569225Z",
     "iopub.status.busy": "2022-07-27T19:13:25.568065Z",
     "iopub.status.idle": "2022-07-27T19:13:25.882366Z",
     "shell.execute_reply": "2022-07-27T19:13:25.881795Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DaskProject(EXPR$0=[/(CAST(CASE(=($1, 0), null:DOUBLE, $0)):DECIMAL(19, 15), $1)]): rowcount = 10.0, cumulative cost = {122.5 rows, 111.0 cpu, 0.0 io}, id = 83\n",
      "  DaskAggregate(group=[{}], agg#0=[$SUM0($2)], agg#1=[COUNT($2)]): rowcount = 10.0, cumulative cost = {112.5 rows, 101.0 cpu, 0.0 io}, id = 82\n",
      "    DaskTableScan(table=[[root, dask]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 77\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(c.explain(\"SELECT AVG(x) FROM dask\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From here, this relational algebra is then converted into a Dask computation graph, which ultimately returns (or in the case of `CREATE TABLE` statements, implicitly assigns) a Dask dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:25.888366Z",
     "iopub.status.busy": "2022-07-27T19:13:25.887305Z",
     "iopub.status.idle": "2022-07-27T19:13:25.995742Z",
     "shell.execute_reply": "2022-07-27T19:13:25.995180Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AVG(\"dask\".\"x\")</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=1</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>float64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "<div>Dask Name: rename, 107 tasks</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "              AVG(\"dask\".\"x\")\n",
       "npartitions=1                \n",
       "                      float64\n",
       "                          ...\n",
       "Dask Name: rename, 107 tasks"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"SELECT AVG(x) FROM dask\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dask dataframes are lazy, meaning that at the time of their creation, none of their dependent tasks have been executed yet.\n",
    "To actually execute these tasks and get a result, we must call `compute`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:26.001551Z",
     "iopub.status.busy": "2022-07-27T19:13:26.000018Z",
     "iopub.status.idle": "2022-07-27T19:13:26.260376Z",
     "shell.execute_reply": "2022-07-27T19:13:26.259738Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AVG(\"dask\".\"x\")</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.000302</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   AVG(\"dask\".\"x\")\n",
       "0        -0.000302"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"SELECT AVG(x) FROM dask\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Looking at the dashboard, we can see that executing this query has triggered some Dask computations.\n",
    "\n",
    "Because the return value of a query is a Dask dataframe, it is also possible to do follow-up operations on it using Dask's dataframe API.\n",
    "This can be useful if we want to perform some complex operations on a dataframe that are not possible through Dask's dataframe API (but can be expressed in SQL), then follow up with some simpler operations that can easily be expressed through the dataframe API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:26.263591Z",
     "iopub.status.busy": "2022-07-27T19:13:26.263168Z",
     "iopub.status.idle": "2022-07-27T19:13:32.834744Z",
     "shell.execute_reply": "2022-07-27T19:13:32.833848Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Alice</th>\n",
       "      <td>-249.383593</td>\n",
       "      <td>0.001241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bob</th>\n",
       "      <td>160.839932</td>\n",
       "      <td>0.000056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Charlie</th>\n",
       "      <td>-77.458027</td>\n",
       "      <td>-0.001389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dan</th>\n",
       "      <td>141.385152</td>\n",
       "      <td>-0.001548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Edith</th>\n",
       "      <td>-33.965445</td>\n",
       "      <td>-0.000867</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Frank</th>\n",
       "      <td>31.380364</td>\n",
       "      <td>-0.000966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>George</th>\n",
       "      <td>291.711276</td>\n",
       "      <td>-0.002320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Hannah</th>\n",
       "      <td>76.193943</td>\n",
       "      <td>-0.001283</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ingrid</th>\n",
       "      <td>69.657261</td>\n",
       "      <td>-0.001849</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Jerry</th>\n",
       "      <td>-35.406853</td>\n",
       "      <td>-0.002052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Kevin</th>\n",
       "      <td>-199.853191</td>\n",
       "      <td>0.000221</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Laura</th>\n",
       "      <td>98.363175</td>\n",
       "      <td>-0.001911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Michael</th>\n",
       "      <td>-100.410534</td>\n",
       "      <td>0.004294</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Norbert</th>\n",
       "      <td>189.525214</td>\n",
       "      <td>-0.000738</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Oliver</th>\n",
       "      <td>-251.094045</td>\n",
       "      <td>-0.000164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Patricia</th>\n",
       "      <td>-37.815014</td>\n",
       "      <td>0.003536</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Quinn</th>\n",
       "      <td>-137.963034</td>\n",
       "      <td>-0.001342</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ray</th>\n",
       "      <td>-274.337917</td>\n",
       "      <td>0.004108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sarah</th>\n",
       "      <td>-237.457164</td>\n",
       "      <td>0.001387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tim</th>\n",
       "      <td>67.416750</td>\n",
       "      <td>0.001667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ursula</th>\n",
       "      <td>-188.578720</td>\n",
       "      <td>0.002330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Victor</th>\n",
       "      <td>-60.309784</td>\n",
       "      <td>-0.000196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Wendy</th>\n",
       "      <td>128.743367</td>\n",
       "      <td>0.000112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Xavier</th>\n",
       "      <td>-158.350232</td>\n",
       "      <td>-0.001734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yvonne</th>\n",
       "      <td>43.986670</td>\n",
       "      <td>0.001555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zelda</th>\n",
       "      <td>-38.438229</td>\n",
       "      <td>0.001045</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   x         y\n",
       "name                          \n",
       "Alice    -249.383593  0.001241\n",
       "Bob       160.839932  0.000056\n",
       "Charlie   -77.458027 -0.001389\n",
       "Dan       141.385152 -0.001548\n",
       "Edith     -33.965445 -0.000867\n",
       "Frank      31.380364 -0.000966\n",
       "George    291.711276 -0.002320\n",
       "Hannah     76.193943 -0.001283\n",
       "Ingrid     69.657261 -0.001849\n",
       "Jerry     -35.406853 -0.002052\n",
       "Kevin    -199.853191  0.000221\n",
       "Laura      98.363175 -0.001911\n",
       "Michael  -100.410534  0.004294\n",
       "Norbert   189.525214 -0.000738\n",
       "Oliver   -251.094045 -0.000164\n",
       "Patricia  -37.815014  0.003536\n",
       "Quinn    -137.963034 -0.001342\n",
       "Ray      -274.337917  0.004108\n",
       "Sarah    -237.457164  0.001387\n",
       "Tim        67.416750  0.001667\n",
       "Ursula   -188.578720  0.002330\n",
       "Victor    -60.309784 -0.000196\n",
       "Wendy     128.743367  0.000112\n",
       "Xavier   -158.350232 -0.001734\n",
       "Yvonne     43.986670  0.001555\n",
       "Zelda     -38.438229  0.001045"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# perform a multi-column sort that isn't possible in Dask\n",
    "res = c.sql(\"\"\"\n",
    "    SELECT * FROM dask ORDER BY name ASC, id DESC, x ASC\n",
    "\"\"\")\n",
    "\n",
    "# now do some follow-up groupby aggregations\n",
    "res.groupby(\"name\").agg({\"x\": \"sum\", \"y\": \"mean\"}).compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom functions and aggregations\n",
    "\n",
    "When standard SQL functionality is insufficient, it is possible to register custom functions for use in queries.\n",
    "These functions can be classified as one of the following:\n",
    "\n",
    "- Column-wise functions\n",
    "- Row-wise functions\n",
    "- Aggregations\n",
    "\n",
    "### Column-wise functions\n",
    "\n",
    "Column-wise functions can take columns or literal values as input and return a column of an identical length.\n",
    "Column-wise functions can be registered in a `Context` using the `register_function` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:32.838458Z",
     "iopub.status.busy": "2022-07-27T19:13:32.838129Z",
     "iopub.status.idle": "2022-07-27T19:13:32.845697Z",
     "shell.execute_reply": "2022-07-27T19:13:32.844999Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def f(x):\n",
    "    return x ** 2\n",
    "\n",
    "c.register_function(f, \"f\", [(\"x\", np.float64)], np.float64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Function registration requires the following inputs:\n",
    "\n",
    "- A callable function\n",
    "- A name for the function to be referred to in queries\n",
    "- A list of tuples, representing the input variables and their respective types, which can be either Pandas or [NumPy](https://numpy.org/) types\n",
    "- A type for the output column\n",
    "\n",
    "Once a function has been registered, it can be called like any other standard SQL function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:32.848877Z",
     "iopub.status.busy": "2022-07-27T19:13:32.848394Z",
     "iopub.status.idle": "2022-07-27T19:13:33.606975Z",
     "shell.execute_reply": "2022-07-27T19:13:33.605935Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>\"F\"(\"dask\".\"x\")</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:00</th>\n",
       "      <td>0.408645</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:01</th>\n",
       "      <td>0.497901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:02</th>\n",
       "      <td>0.064370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:03</th>\n",
       "      <td>0.421497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:04</th>\n",
       "      <td>0.304109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:55</th>\n",
       "      <td>0.691240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:56</th>\n",
       "      <td>0.499867</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:57</th>\n",
       "      <td>0.049903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:58</th>\n",
       "      <td>0.004089</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:59</th>\n",
       "      <td>0.490209</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2592000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     \"F\"(\"dask\".\"x\")\n",
       "timestamp                           \n",
       "2000-01-01 00:00:00         0.408645\n",
       "2000-01-01 00:00:01         0.497901\n",
       "2000-01-01 00:00:02         0.064370\n",
       "2000-01-01 00:00:03         0.421497\n",
       "2000-01-01 00:00:04         0.304109\n",
       "...                              ...\n",
       "2000-01-30 23:59:55         0.691240\n",
       "2000-01-30 23:59:56         0.499867\n",
       "2000-01-30 23:59:57         0.049903\n",
       "2000-01-30 23:59:58         0.004089\n",
       "2000-01-30 23:59:59         0.490209\n",
       "\n",
       "[2592000 rows x 1 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"SELECT F(x) FROM dask\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Row-wise functions\n",
    "\n",
    "In some cases, it may be easier to write a custom function that processes a dict-like `row` object; such a function is known as a row-wise function.\n",
    "These functions can also be registered using `register_function` by passing `row_udf=True`, and used in the same manner as a column-wise function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:13:33.610793Z",
     "iopub.status.busy": "2022-07-27T19:13:33.610149Z",
     "iopub.status.idle": "2022-07-27T19:14:03.304712Z",
     "shell.execute_reply": "2022-07-27T19:14:03.303925Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>\"G\"(\"dask\".\"x\", \"dask\".\"y\")</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:00</th>\n",
       "      <td>0.446911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:01</th>\n",
       "      <td>0.900878</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:02</th>\n",
       "      <td>0.052787</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:03</th>\n",
       "      <td>0.454549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:04</th>\n",
       "      <td>1.157125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:55</th>\n",
       "      <td>1.603634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:56</th>\n",
       "      <td>1.389727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:57</th>\n",
       "      <td>0.671131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:58</th>\n",
       "      <td>0.773367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:59</th>\n",
       "      <td>0.023842</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2592000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     \"G\"(\"dask\".\"x\", \"dask\".\"y\")\n",
       "timestamp                                       \n",
       "2000-01-01 00:00:00                     0.446911\n",
       "2000-01-01 00:00:01                     0.900878\n",
       "2000-01-01 00:00:02                     0.052787\n",
       "2000-01-01 00:00:03                     0.454549\n",
       "2000-01-01 00:00:04                     1.157125\n",
       "...                                          ...\n",
       "2000-01-30 23:59:55                     1.603634\n",
       "2000-01-30 23:59:56                     1.389727\n",
       "2000-01-30 23:59:57                     0.671131\n",
       "2000-01-30 23:59:58                     0.773367\n",
       "2000-01-30 23:59:59                     0.023842\n",
       "\n",
       "[2592000 rows x 1 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def g(row):\n",
    "    if row[\"x\"] > row[\"y\"]:\n",
    "        return row[\"x\"] - row[\"y\"]\n",
    "    return row[\"y\"] - row[\"x\"]\n",
    "\n",
    "c.register_function(g, \"g\", [(\"x\", np.float64), (\"y\", np.float64)], np.float64, row_udf=True)\n",
    "\n",
    "c.sql(\"SELECT G(x, y) FROM dask\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that unlike column-wise functions, which are called directly using specified columns and literals as input, row-wise functions are called using `apply`, which can have unpredictable performance depending on the underlying dataframe library (e.g. Pandas, cuDF) and the function itself.\n",
    "\n",
    "### Aggregations\n",
    "\n",
    "Aggregations take a single column as input and return a single value - thus, they can only be used to reduce an entire column or the groups of a `GROUP BY` query.\n",
    "Aggregations can be registered using the `register_aggregation` method, which is functionally similar to `register_function` but takes a Dask [Aggregation](https://docs.dask.org/en/latest/dataframe-groupby.html#aggregate) as input instead of a callable function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:14:03.308454Z",
     "iopub.status.busy": "2022-07-27T19:14:03.307924Z",
     "iopub.status.idle": "2022-07-27T19:14:03.641638Z",
     "shell.execute_reply": "2022-07-27T19:14:03.640519Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>\"MY_SUM\"(\"dask\".\"x\")</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-781.618678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   \"MY_SUM\"(\"dask\".\"x\")\n",
       "0           -781.618678"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dask.dataframe as dd\n",
    "\n",
    "my_sum = dd.Aggregation(\"my_sum\", lambda x: x.sum(), lambda x: x.sum())\n",
    "\n",
    "c.register_aggregation(my_sum, \"my_sum\", [(\"x\", np.float64)], np.float64)\n",
    "\n",
    "c.sql(\"SELECT MY_SUM(x) FROM dask\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Machine learning in SQL\n",
    "\n",
    "Dask-SQL has support for both model training and prediction, enabling machine learning workflows with a flexible combination of both Python and SQL.\n",
    "A model can be registered in a `Context` either through the `register_model` method or a `CREATE MODEL` statement."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:14:03.645227Z",
     "iopub.status.busy": "2022-07-27T19:14:03.644789Z",
     "iopub.status.idle": "2022-07-27T19:14:05.128098Z",
     "shell.execute_reply": "2022-07-27T19:14:05.127026Z"
    }
   },
   "outputs": [],
   "source": [
    "from dask_ml.linear_model import LinearRegression\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "# create a scikit-learn model and train it\n",
    "model = GradientBoostingClassifier()\n",
    "data = c.sql(\"SELECT x, y, x * y > 0 AS target FROM dask LIMIT 50\")\n",
    "model.fit(data[[\"x\", \"y\"]], data[\"target\"])\n",
    "\n",
    "# register this model in the context\n",
    "c.register_model(\"python_model\", model, training_columns=[\"x\", \"y\"])\n",
    "\n",
    "# create and train a model directly from SQL\n",
    "c.sql(\"\"\"\n",
    "    CREATE MODEL sql_model WITH (\n",
    "        model_class = 'sklearn.ensemble.GradientBoostingClassifier',\n",
    "        wrap_predict = True,\n",
    "        target_column = 'target'\n",
    "    ) AS (\n",
    "        SELECT x, y, x * y > 0 AS target\n",
    "        FROM dask\n",
    "        LIMIT 50\n",
    "    )\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Registered models must follow the [scikit-learn](https://scikit-learn.org/stable/index.html) interface by implementing a `predict` method.\n",
    "As with tables, all registered models can be listed with a `SHOW MODELS` statement."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:14:05.131490Z",
     "iopub.status.busy": "2022-07-27T19:14:05.131070Z",
     "iopub.status.idle": "2022-07-27T19:14:05.178606Z",
     "shell.execute_reply": "2022-07-27T19:14:05.177580Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Models</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>python_model</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sql_model</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Models\n",
       "0  python_model\n",
       "1     sql_model"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"SHOW MODELS\").compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From here, the models can be used to make predictions using the `PREDICT` keyword as part of a `SELECT` query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-27T19:14:05.181956Z",
     "iopub.status.busy": "2022-07-27T19:14:05.181600Z",
     "iopub.status.idle": "2022-07-27T19:14:10.973388Z",
     "shell.execute_reply": "2022-07-27T19:14:10.972696Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>actual</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:50</th>\n",
       "      <td>-0.508541</td>\n",
       "      <td>-0.018462</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:51</th>\n",
       "      <td>0.652920</td>\n",
       "      <td>-0.847008</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:52</th>\n",
       "      <td>-0.779734</td>\n",
       "      <td>0.117797</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:53</th>\n",
       "      <td>0.360605</td>\n",
       "      <td>-0.965205</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:54</th>\n",
       "      <td>-0.475373</td>\n",
       "      <td>0.652320</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:55</th>\n",
       "      <td>-0.831409</td>\n",
       "      <td>0.772225</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:56</th>\n",
       "      <td>-0.707013</td>\n",
       "      <td>0.682714</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:57</th>\n",
       "      <td>-0.223391</td>\n",
       "      <td>0.447740</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:58</th>\n",
       "      <td>0.063943</td>\n",
       "      <td>-0.709424</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-30 23:59:59</th>\n",
       "      <td>-0.700149</td>\n",
       "      <td>-0.723991</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2591950 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            x         y  actual  target\n",
       "timestamp                                              \n",
       "2000-01-01 00:00:50 -0.508541 -0.018462    True    True\n",
       "2000-01-01 00:00:51  0.652920 -0.847008   False   False\n",
       "2000-01-01 00:00:52 -0.779734  0.117797   False   False\n",
       "2000-01-01 00:00:53  0.360605 -0.965205   False   False\n",
       "2000-01-01 00:00:54 -0.475373  0.652320   False   False\n",
       "...                       ...       ...     ...     ...\n",
       "2000-01-30 23:59:55 -0.831409  0.772225   False   False\n",
       "2000-01-30 23:59:56 -0.707013  0.682714   False   False\n",
       "2000-01-30 23:59:57 -0.223391  0.447740   False   False\n",
       "2000-01-30 23:59:58  0.063943 -0.709424   False   False\n",
       "2000-01-30 23:59:59 -0.700149 -0.723991    True    True\n",
       "\n",
       "[2591950 rows x 4 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.sql(\"\"\"\n",
    "    SELECT * FROM PREDICT (\n",
    "        MODEL sql_model,\n",
    "        SELECT x, y, x * y > 0 AS actual FROM dask\n",
    "        OFFSET 50\n",
    "    )\n",
    "\"\"\").compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}