diff --git a/notebooks/html-parse.ipynb b/notebooks/html-parse.ipynb new file mode 100644 index 0000000..af5a7c4 --- /dev/null +++ b/notebooks/html-parse.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "23283830-032b-484a-b599-66ba8e7c2001", + "metadata": {}, + "source": [ + "# HTML parse\n", + "\n", + "in this notebook:\n", + "* prepare jupyter environment\n", + "* download [cable bible](https://amiaopensource.github.io/cable-bible/) index.html file\n", + "* parse html" + ] + }, + { + "cell_type": "markdown", + "id": "ae76266b-b137-43ec-80f7-3f2c992215d9", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "## prepare jupyter environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6360c8e3-8958-41c5-938a-f713af2ae715", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "data": { + "application/javascript": [ + "if (!(\"Notification\" in window)) {\n", + " alert(\"This browser does not support desktop notifications, so the %%notify magic will not work.\");\n", + "} else if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {\n", + " Notification.requestPermission(function (permission) {\n", + " if(!('permission' in Notification)) {\n", + " Notification.permission = permission;\n", + " }\n", + " })\n", + "}\n", + "\n", + "if(!window.jQuery) {\n", + " var jq = document.createElement('script');\n", + " jq.src = \"//ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js\";\n", + " document.getElementsByTagName('head')[0].appendChild(jq);\n", + "}\n", + "\n", + "// Detect if the window is out of focus.\n", + "window.jupyterNotifyIsInBackground = undefined;\n", + "(function() {\n", + " // Check document.hidden support\n", + " var hidden;\n", + " if (typeof document.hidden !== \"undefined\") { // Opera 12.10 and Firefox 18 and later support\n", + " hidden = \"hidden\";\n", + " } else if (typeof document.msHidden !== \"undefined\") {\n", + " hidden = \"msHidden\";\n", + " } else if (typeof document.webkitHidden !== \"undefined\") {\n", + " hidden = \"webkitHidden\";\n", + " }\n", + "\n", + " // Set initial background state\n", + " if (document[hidden]) {\n", + " window.jupyterNotifyIsInBackground = true;\n", + " } else {\n", + " window.jupyterNotifyIsInBackground = false;\n", + " }\n", + "\n", + " window.addEventListener('blur', function() { window.jupyterNotifyIsInBackground = true; }, false);\n", + " window.addEventListener('focus', function() { window.jupyterNotifyIsInBackground = false; }, false);\n", + "})();\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%pip install jupyterlab-vim -q\n", + "%pip install git+https://github.com/cphyc/jupyter-notify.git -q\n", + "#%reload_ext jupyternotify\n", + "%load_ext jupyternotify" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d5946431-9f33-4caa-835b-ab09fb73d0e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install pandas -q \\\n", + " html5lib -q \\\n", + " beautifulsoup4 -q" + ] + }, + { + "cell_type": "markdown", + "id": "bcbc54b8-7e77-446c-bb9d-4753bb61bdf9", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "## download cable bible" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c245e2c1-a5f1-4fa2-9037-7e1773208854", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory ‘/tmp/scrape-demo’: File exists\n", + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 150k 100 150k 0 0 1089k 0 --:--:-- --:--:-- --:--:-- 1089k\n", + "\n", + "\n" + ] + } + ], + "source": [ + "!mkdir /tmp/scrape-demo\n", + "!curl \\\n", + " https://raw.githubusercontent.com/amiaopensource/cable-bible/master/index.html \\\n", + " -o /tmp/scrape-demo/index.html\n", + "!head -2 /tmp/scrape-demo/index.html\n", + "#!cp /tmp/scrape-demo/index.html .\n", + "#!code ." + ] + }, + { + "cell_type": "markdown", + "id": "2e7e4c6d-6d1b-44fa-8527-dd847cd63895", + "metadata": { + "tags": [] + }, + "source": [ + "## parse html" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b28f8b7c-05c0-462f-93ab-10ff0b35176f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from bs4 import BeautifulSoup\n", + "html = Path('/tmp/scrape-demo/index.html').read_text()\n", + "soup = BeautifulSoup(html, 'html.parser')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "81767551-c9b6-4e98-80cf-1fc49b3a4262", + "metadata": {}, + "outputs": [], + "source": [ + "central_div = soup.find(\"div\", {\"class\": \"well col-md-8 col-md-offset-0\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e134d6be-e32a-4cd7-91da-9ba359870e3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['Video', 'Audio', 'Data', 'Power'])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = {\n", + " e.h2.string: e for e\n", + " in central_div.find_all('div', id=lambda x: x != 'table_of_contents')\n", + " if e.h2 is not None}\n", + "a.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "80ba6ebb-d679-4767-8917-348fe441f185", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['Analog Video', 'Digital Video', 'Integrated Video'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b = {e.h3.string: e for e in\n", + " a['Video'].find_all(\"div\", {\"class\": \"well\"})\n", + " if e.find_all('h3', id=lambda x: x is not None)}\n", + "b.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d414e989-ba00-4a60-a357-571286f36276", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['composite', 'component_ypbpr', 's-video', 'yc-688', 'rgbs', 'rgbvh'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c = {\n", + " e.h4['id']:e for e in\n", + " b['Analog Video'].find_all(\"div\", {\"class\": \"well\"})\n", + " if e.h4 is not None}\n", + "c.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "97b1f814-c16e-42d3-96ff-b2faa27d5830", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['Composite RCA', 'Composite BNC', 'Composite UHF', 'Composite F-Type', 'Composite Video Patch (MUSA)', 'Composite 8-pin EIAJ', 'Composite SCART'])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = {e.h3.string: e.p.string for e\n", + " in c['composite'].find_all(\"div\", {\"class\": \"modal fade\"})}\n", + "d.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2fca9ace-a92f-4eda-adbe-d20c845f2cd5", + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "$(document).ready(\n", + " function() {\n", + " function appendUniqueDiv(){\n", + " // append a div with our uuid so we can check that it's already\n", + " // been sent and avoid duplicates on page reload\n", + " var notifiedDiv = document.createElement(\"div\")\n", + " notifiedDiv.id = \"53b8a038-c6fd-4ea1-b291-5403fb3b742d\"\n", + " element.append(notifiedDiv)\n", + " }\n", + "\n", + " // only send notifications if the pageload is complete; this will\n", + " // help stop extra notifications when a saved notebook is loaded,\n", + " // which during testing gives us state \"interactive\", not \"complete\"\n", + " if (document.readyState === 'complete') {\n", + " // check for the div that signifies that the notification\n", + " // was already sent\n", + " if (document.getElementById(\"53b8a038-c6fd-4ea1-b291-5403fb3b742d\") === null) {\n", + " var notificationPayload = {\"requireInteraction\": false, \"icon\": \"/static/base/images/favicon.ico\", \"body\": \"Cell execution has finished!\", \"only_in_background\": false};\n", + "\n", + " // We have a notification but the window is active\n", + " if (notificationPayload.only_in_background && !window.jupyterNotifyIsInBackground) {\n", + " appendUniqueDiv();\n", + " return;\n", + " }\n", + " if (Notification.permission !== 'denied') {\n", + " if (Notification.permission !== 'granted') { \n", + " Notification.requestPermission(function (permission) {\n", + " if(!('permission' in Notification)) {\n", + " Notification.permission = permission\n", + " }\n", + " })\n", + " }\n", + " if (Notification.permission === 'granted') {\n", + " var notification = new Notification(\"Jupyter Notebook\", notificationPayload)\n", + " appendUniqueDiv()\n", + " notification.onclick = function () {\n", + " window.focus();\n", + " this.close();\n", + " };\n", + " } \n", + " } \n", + " }\n", + " }\n", + " }\n", + ")\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%notify" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}