add html parsing notebook

This commit is contained in:
DannyDannyDanny 2022-10-02 01:06:26 +02:00
parent 47ffcda44e
commit a338bfaace

378
notebooks/html-parse.ipynb Normal file
View file

@ -0,0 +1,378 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "23283830-032b-484a-b599-66ba8e7c2001",
"metadata": {},
"source": [
"# HTML parse\n",
"\n",
"in this notebook:\n",
"* prepare jupyter environment\n",
"* download [cable bible](https://amiaopensource.github.io/cable-bible/) index.html file\n",
"* parse html"
]
},
{
"cell_type": "markdown",
"id": "ae76266b-b137-43ec-80f7-3f2c992215d9",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## prepare jupyter environment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6360c8e3-8958-41c5-938a-f713af2ae715",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"data": {
"application/javascript": [
"if (!(\"Notification\" in window)) {\n",
" alert(\"This browser does not support desktop notifications, so the %%notify magic will not work.\");\n",
"} else if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {\n",
" Notification.requestPermission(function (permission) {\n",
" if(!('permission' in Notification)) {\n",
" Notification.permission = permission;\n",
" }\n",
" })\n",
"}\n",
"\n",
"if(!window.jQuery) {\n",
" var jq = document.createElement('script');\n",
" jq.src = \"//ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js\";\n",
" document.getElementsByTagName('head')[0].appendChild(jq);\n",
"}\n",
"\n",
"// Detect if the window is out of focus.\n",
"window.jupyterNotifyIsInBackground = undefined;\n",
"(function() {\n",
" // Check document.hidden support\n",
" var hidden;\n",
" if (typeof document.hidden !== \"undefined\") { // Opera 12.10 and Firefox 18 and later support\n",
" hidden = \"hidden\";\n",
" } else if (typeof document.msHidden !== \"undefined\") {\n",
" hidden = \"msHidden\";\n",
" } else if (typeof document.webkitHidden !== \"undefined\") {\n",
" hidden = \"webkitHidden\";\n",
" }\n",
"\n",
" // Set initial background state\n",
" if (document[hidden]) {\n",
" window.jupyterNotifyIsInBackground = true;\n",
" } else {\n",
" window.jupyterNotifyIsInBackground = false;\n",
" }\n",
"\n",
" window.addEventListener('blur', function() { window.jupyterNotifyIsInBackground = true; }, false);\n",
" window.addEventListener('focus', function() { window.jupyterNotifyIsInBackground = false; }, false);\n",
"})();\n"
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%pip install jupyterlab-vim -q\n",
"%pip install git+https://github.com/cphyc/jupyter-notify.git -q\n",
"#%reload_ext jupyternotify\n",
"%load_ext jupyternotify"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d5946431-9f33-4caa-835b-ab09fb73d0e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install pandas -q \\\n",
" html5lib -q \\\n",
" beautifulsoup4 -q"
]
},
{
"cell_type": "markdown",
"id": "bcbc54b8-7e77-446c-bb9d-4753bb61bdf9",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## download cable bible"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c245e2c1-a5f1-4fa2-9037-7e1773208854",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mkdir: cannot create directory /tmp/scrape-demo: File exists\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 150k 100 150k 0 0 1089k 0 --:--:-- --:--:-- --:--:-- 1089k\n",
"<!DOCTYPE html>\n",
"<html lang=\"en\">\n"
]
}
],
"source": [
"!mkdir /tmp/scrape-demo\n",
"!curl \\\n",
" https://raw.githubusercontent.com/amiaopensource/cable-bible/master/index.html \\\n",
" -o /tmp/scrape-demo/index.html\n",
"!head -2 /tmp/scrape-demo/index.html\n",
"#!cp /tmp/scrape-demo/index.html .\n",
"#!code ."
]
},
{
"cell_type": "markdown",
"id": "2e7e4c6d-6d1b-44fa-8527-dd847cd63895",
"metadata": {
"tags": []
},
"source": [
"## parse html"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b28f8b7c-05c0-462f-93ab-10ff0b35176f",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from bs4 import BeautifulSoup\n",
"html = Path('/tmp/scrape-demo/index.html').read_text()\n",
"soup = BeautifulSoup(html, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "81767551-c9b6-4e98-80cf-1fc49b3a4262",
"metadata": {},
"outputs": [],
"source": [
"central_div = soup.find(\"div\", {\"class\": \"well col-md-8 col-md-offset-0\"})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e134d6be-e32a-4cd7-91da-9ba359870e3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Video', 'Audio', 'Data', 'Power'])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = {\n",
" e.h2.string: e for e\n",
" in central_div.find_all('div', id=lambda x: x != 'table_of_contents')\n",
" if e.h2 is not None}\n",
"a.keys()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "80ba6ebb-d679-4767-8917-348fe441f185",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Analog Video', 'Digital Video', 'Integrated Video'])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b = {e.h3.string: e for e in\n",
" a['Video'].find_all(\"div\", {\"class\": \"well\"})\n",
" if e.find_all('h3', id=lambda x: x is not None)}\n",
"b.keys()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d414e989-ba00-4a60-a357-571286f36276",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['composite', 'component_ypbpr', 's-video', 'yc-688', 'rgbs', 'rgbvh'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c = {\n",
" e.h4['id']:e for e in\n",
" b['Analog Video'].find_all(\"div\", {\"class\": \"well\"})\n",
" if e.h4 is not None}\n",
"c.keys()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "97b1f814-c16e-42d3-96ff-b2faa27d5830",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Composite RCA', 'Composite BNC', 'Composite UHF', 'Composite F-Type', 'Composite Video Patch (MUSA)', 'Composite 8-pin EIAJ', 'Composite SCART'])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d = {e.h3.string: e.p.string for e\n",
" in c['composite'].find_all(\"div\", {\"class\": \"modal fade\"})}\n",
"d.keys()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2fca9ace-a92f-4eda-adbe-d20c845f2cd5",
"metadata": {},
"outputs": [
{
"data": {
"application/javascript": [
"$(document).ready(\n",
" function() {\n",
" function appendUniqueDiv(){\n",
" // append a div with our uuid so we can check that it's already\n",
" // been sent and avoid duplicates on page reload\n",
" var notifiedDiv = document.createElement(\"div\")\n",
" notifiedDiv.id = \"53b8a038-c6fd-4ea1-b291-5403fb3b742d\"\n",
" element.append(notifiedDiv)\n",
" }\n",
"\n",
" // only send notifications if the pageload is complete; this will\n",
" // help stop extra notifications when a saved notebook is loaded,\n",
" // which during testing gives us state \"interactive\", not \"complete\"\n",
" if (document.readyState === 'complete') {\n",
" // check for the div that signifies that the notification\n",
" // was already sent\n",
" if (document.getElementById(\"53b8a038-c6fd-4ea1-b291-5403fb3b742d\") === null) {\n",
" var notificationPayload = {\"requireInteraction\": false, \"icon\": \"/static/base/images/favicon.ico\", \"body\": \"Cell execution has finished!\", \"only_in_background\": false};\n",
"\n",
" // We have a notification but the window is active\n",
" if (notificationPayload.only_in_background && !window.jupyterNotifyIsInBackground) {\n",
" appendUniqueDiv();\n",
" return;\n",
" }\n",
" if (Notification.permission !== 'denied') {\n",
" if (Notification.permission !== 'granted') { \n",
" Notification.requestPermission(function (permission) {\n",
" if(!('permission' in Notification)) {\n",
" Notification.permission = permission\n",
" }\n",
" })\n",
" }\n",
" if (Notification.permission === 'granted') {\n",
" var notification = new Notification(\"Jupyter Notebook\", notificationPayload)\n",
" appendUniqueDiv()\n",
" notification.onclick = function () {\n",
" window.focus();\n",
" this.close();\n",
" };\n",
" } \n",
" } \n",
" }\n",
" }\n",
" }\n",
")\n"
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%notify"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}