{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "23283830-032b-484a-b599-66ba8e7c2001",
   "metadata": {},
   "source": [
    "# HTML parse\n",
    "\n",
    "In this notebook:\n",
    "* prepare the Jupyter environment\n",
    "* download the [cable bible](https://amiaopensource.github.io/cable-bible/) `index.html` file\n",
    "* parse the HTML with BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae76266b-b137-43ec-80f7-3f2c992215d9",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## prepare jupyter environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6360c8e3-8958-41c5-938a-f713af2ae715",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "data": {
      "application/javascript": [
       "if (!(\"Notification\" in window)) {\n",
       "    alert(\"This browser does not support desktop notifications, so the %%notify magic will not work.\");\n",
       "} else if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {\n",
       "    Notification.requestPermission(function (permission) {\n",
       "        if(!('permission' in Notification)) {\n",
       "            Notification.permission = permission;\n",
       "        }\n",
       "    })\n",
       "}\n",
       "\n",
       "if(!window.jQuery) {\n",
       "    var jq = document.createElement('script');\n",
       "    jq.src = \"//ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js\";\n",
       "    document.getElementsByTagName('head')[0].appendChild(jq);\n",
       "}\n",
       "\n",
       "// Detect if the window is out of focus.\n",
       "window.jupyterNotifyIsInBackground = undefined;\n",
       "(function() {\n",
       "    // Check document.hidden support\n",
       "    var hidden;\n",
       "    if (typeof document.hidden !== \"undefined\") { // Opera 12.10 and Firefox 18 and later support\n",
       "        hidden = \"hidden\";\n",
       "    } else if (typeof document.msHidden !== \"undefined\") {\n",
       "        hidden = \"msHidden\";\n",
       "    } else if (typeof document.webkitHidden !== \"undefined\") {\n",
       "        hidden = \"webkitHidden\";\n",
       "    }\n",
       "\n",
       "    // Set initial background state\n",
       "    if (document[hidden]) {\n",
       "        window.jupyterNotifyIsInBackground = true;\n",
       "    } else {\n",
       "        window.jupyterNotifyIsInBackground = false;\n",
       "    }\n",
       "\n",
       "    window.addEventListener('blur', function() { window.jupyterNotifyIsInBackground = true; }, false);\n",
       "    window.addEventListener('focus', function() { window.jupyterNotifyIsInBackground = false; }, false);\n",
       "})();\n"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# jupyternotify provides the %notify magic used in the last cell of this notebook.\n",
    "%pip install jupyterlab-vim -q\n",
    "%pip install git+https://github.com/cphyc/jupyter-notify.git -q\n",
    "#%reload_ext jupyternotify\n",
    "%load_ext jupyternotify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d5946431-9f33-4caa-835b-ab09fb73d0e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# beautifulsoup4 is what the parsing cells below import; pandas and html5lib are not\n",
    "# used in this notebook yet (TODO: confirm they are still needed).\n",
    "%pip install -q pandas html5lib beautifulsoup4"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcbc54b8-7e77-446c-bb9d-4753bb61bdf9",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## download cable bible"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c245e2c1-a5f1-4fa2-9037-7e1773208854",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  150k  100  150k    0     0  1089k      0 --:--:-- --:--:-- --:--:-- 1089k\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!mkdir -p /tmp/scrape-demo\n",
    "!curl --fail \\\n",
    "    https://raw.githubusercontent.com/amiaopensource/cable-bible/master/index.html \\\n",
    "    -o /tmp/scrape-demo/index.html\n",
    "!head -2 /tmp/scrape-demo/index.html\n",
    "#!cp /tmp/scrape-demo/index.html .\n",
    "#!code ."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e7e4c6d-6d1b-44fa-8527-dd847cd63895",
   "metadata": {
    "tags": []
   },
   "source": [
    "## parse html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b28f8b7c-05c0-462f-93ab-10ff0b35176f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.\n",
    "html = Path('/tmp/scrape-demo/index.html').read_text(encoding='utf-8')\n",
    "soup = BeautifulSoup(html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "81767551-c9b6-4e98-80cf-1fc49b3a4262",
   "metadata": {},
   "outputs": [],
   "source": [
    "central_div = soup.find(\"div\", {\"class\": \"well col-md-8 col-md-offset-0\"})\n",
    "# find() returns None when nothing matches; fail here with a clear message instead of\n",
    "# with an AttributeError in a later cell.\n",
    "assert central_div is not None, 'main content <div> not found; has the page layout changed?'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e134d6be-e32a-4cd7-91da-9ba359870e3f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['Video', 'Audio', 'Data', 'Power'])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map the text of each div's first <h2> to that div; the div whose id is\n",
    "# 'table_of_contents' and divs without any <h2> are left out.\n",
    "sections = {\n",
    "    div.h2.string: div\n",
    "    for div in central_div.find_all('div', id=lambda x: x != 'table_of_contents')\n",
    "    if div.h2 is not None\n",
    "}\n",
    "sections.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "80ba6ebb-d679-4767-8917-348fe441f185",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['Analog Video', 'Digital Video', 'Integrated Video'])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Within 'Video', keep the 'well' divs that contain an <h3> carrying an id attribute,\n",
    "# keyed by the text of the div's first <h3>.\n",
    "video_groups = {\n",
    "    div.h3.string: div\n",
    "    for div in sections['Video'].find_all(\"div\", {\"class\": \"well\"})\n",
    "    if div.find('h3', id=True) is not None\n",
    "}\n",
    "video_groups.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d414e989-ba00-4a60-a357-571286f36276",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['composite', 'component_ypbpr', 's-video', 'yc-688', 'rgbs', 'rgbvh'])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Within 'Analog Video', key each 'well' div by the id of its first <h4>.\n",
    "analog_formats = {\n",
    "    div.h4['id']: div\n",
    "    for div in video_groups['Analog Video'].find_all(\"div\", {\"class\": \"well\"})\n",
    "    if div.h4 is not None\n",
    "}\n",
    "analog_formats.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "97b1f814-c16e-42d3-96ff-b2faa27d5830",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['Composite RCA', 'Composite BNC', 'Composite UHF', 'Composite F-Type', 'Composite Video Patch (MUSA)', 'Composite 8-pin EIAJ', 'Composite SCART'])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For every 'modal fade' div under 'composite': text of its first <h3> -> text of its first <p>.\n",
    "composite_connectors = {\n",
    "    modal.h3.string: modal.p.string\n",
    "    for modal in analog_formats['composite'].find_all(\"div\", {\"class\": \"modal fade\"})\n",
    "}\n",
    "composite_connectors.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2fca9ace-a92f-4eda-adbe-d20c845f2cd5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/javascript": [
       "$(document).ready(\n",
       "    function() {\n",
       "        function appendUniqueDiv(){\n",
       "            // append a div with our uuid so we can check that it's already\n",
       "            // been sent and avoid duplicates on page reload\n",
       "            var notifiedDiv = document.createElement(\"div\")\n",
       "            notifiedDiv.id = \"53b8a038-c6fd-4ea1-b291-5403fb3b742d\"\n",
       "            element.append(notifiedDiv)\n",
       "        }\n",
       "\n",
       "        // only send notifications if the pageload is complete; this will\n",
       "        // help stop extra notifications when a saved notebook is loaded,\n",
       "        // which during testing gives us state \"interactive\", not \"complete\"\n",
       "        if (document.readyState === 'complete') {\n",
       "            // check for the div that signifies that the notification\n",
       "            // was already sent\n",
       "            if (document.getElementById(\"53b8a038-c6fd-4ea1-b291-5403fb3b742d\") === null) {\n",
       "                var notificationPayload = {\"requireInteraction\": false, \"icon\": \"/static/base/images/favicon.ico\", \"body\": \"Cell execution has finished!\", \"only_in_background\": false};\n",
       "\n",
       "                // We have a notification but the window is active\n",
       "                if (notificationPayload.only_in_background && !window.jupyterNotifyIsInBackground) {\n",
       "                    appendUniqueDiv();\n",
       "                    return;\n",
       "                }\n",
       "                if (Notification.permission !== 'denied') {\n",
       "                    if (Notification.permission !== 'granted') { \n",
       "                        Notification.requestPermission(function (permission) {\n",
       "                            if(!('permission' in Notification)) {\n",
       "                                Notification.permission = permission\n",
       "                            }\n",
       "                        })\n",
       "                    }\n",
       "                    if (Notification.permission === 'granted') {\n",
       "                        var notification = new Notification(\"Jupyter Notebook\", notificationPayload)\n",
       "                        appendUniqueDiv()\n",
       "                        notification.onclick = function () {\n",
       "                            window.focus();\n",
       "                            this.close();\n",
       "                        };\n",
       "                    } \n",
       "                } \n",
       "            }\n",
       "        }\n",
       "    }\n",
       ")\n"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%notify"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}