✨ add html parsing notebook
This commit is contained in:
parent
47ffcda44e
commit
a338bfaace
1 changed files with 378 additions and 0 deletions
378
notebooks/html-parse.ipynb
Normal file
378
notebooks/html-parse.ipynb
Normal file
|
|
@ -0,0 +1,378 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "23283830-032b-484a-b599-66ba8e7c2001",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# HTML parse\n",
|
||||
"\n",
|
||||
"in this notebook:\n",
|
||||
"* prepare jupyter environment\n",
|
||||
"* download [cable bible](https://amiaopensource.github.io/cable-bible/) index.html file\n",
|
||||
"* parse html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae76266b-b137-43ec-80f7-3f2c992215d9",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true,
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## prepare jupyter environment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "6360c8e3-8958-41c5-938a-f713af2ae715",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/javascript": [
|
||||
"if (!(\"Notification\" in window)) {\n",
|
||||
" alert(\"This browser does not support desktop notifications, so the %%notify magic will not work.\");\n",
|
||||
"} else if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {\n",
|
||||
" Notification.requestPermission(function (permission) {\n",
|
||||
" if(!('permission' in Notification)) {\n",
|
||||
" Notification.permission = permission;\n",
|
||||
" }\n",
|
||||
" })\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"if(!window.jQuery) {\n",
|
||||
" var jq = document.createElement('script');\n",
|
||||
" jq.src = \"//ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js\";\n",
|
||||
" document.getElementsByTagName('head')[0].appendChild(jq);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"// Detect if the window is out of focus.\n",
|
||||
"window.jupyterNotifyIsInBackground = undefined;\n",
|
||||
"(function() {\n",
|
||||
" // Check document.hidden support\n",
|
||||
" var hidden;\n",
|
||||
" if (typeof document.hidden !== \"undefined\") { // Opera 12.10 and Firefox 18 and later support\n",
|
||||
" hidden = \"hidden\";\n",
|
||||
" } else if (typeof document.msHidden !== \"undefined\") {\n",
|
||||
" hidden = \"msHidden\";\n",
|
||||
" } else if (typeof document.webkitHidden !== \"undefined\") {\n",
|
||||
" hidden = \"webkitHidden\";\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" // Set initial background state\n",
|
||||
" if (document[hidden]) {\n",
|
||||
" window.jupyterNotifyIsInBackground = true;\n",
|
||||
" } else {\n",
|
||||
" window.jupyterNotifyIsInBackground = false;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" window.addEventListener('blur', function() { window.jupyterNotifyIsInBackground = true; }, false);\n",
|
||||
" window.addEventListener('focus', function() { window.jupyterNotifyIsInBackground = false; }, false);\n",
|
||||
"})();\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Javascript object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install jupyterlab-vim -q\n",
|
||||
"%pip install git+https://github.com/cphyc/jupyter-notify.git -q\n",
|
||||
"#%reload_ext jupyternotify\n",
|
||||
"%load_ext jupyternotify"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "d5946431-9f33-4caa-835b-ab09fb73d0e2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install pandas -q \\\n",
|
||||
" html5lib -q \\\n",
|
||||
" beautifulsoup4 -q"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bcbc54b8-7e77-446c-bb9d-4753bb61bdf9",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true,
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## download cable bible"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c245e2c1-a5f1-4fa2-9037-7e1773208854",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"mkdir: cannot create directory ‘/tmp/scrape-demo’: File exists\n",
|
||||
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
|
||||
" Dload Upload Total Spent Left Speed\n",
|
||||
"100 150k 100 150k 0 0 1089k 0 --:--:-- --:--:-- --:--:-- 1089k\n",
|
||||
"<!DOCTYPE html>\n",
|
||||
"<html lang=\"en\">\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!mkdir /tmp/scrape-demo\n",
|
||||
"!curl \\\n",
|
||||
" https://raw.githubusercontent.com/amiaopensource/cable-bible/master/index.html \\\n",
|
||||
" -o /tmp/scrape-demo/index.html\n",
|
||||
"!head -2 /tmp/scrape-demo/index.html\n",
|
||||
"#!cp /tmp/scrape-demo/index.html .\n",
|
||||
"#!code ."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2e7e4c6d-6d1b-44fa-8527-dd847cd63895",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## parse html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b28f8b7c-05c0-462f-93ab-10ff0b35176f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"html = Path('/tmp/scrape-demo/index.html').read_text()\n",
|
||||
"soup = BeautifulSoup(html, 'html.parser')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "81767551-c9b6-4e98-80cf-1fc49b3a4262",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"central_div = soup.find(\"div\", {\"class\": \"well col-md-8 col-md-offset-0\"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "e134d6be-e32a-4cd7-91da-9ba359870e3f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"dict_keys(['Video', 'Audio', 'Data', 'Power'])"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = {\n",
|
||||
" e.h2.string: e for e\n",
|
||||
" in central_div.find_all('div', id=lambda x: x != 'table_of_contents')\n",
|
||||
" if e.h2 is not None}\n",
|
||||
"a.keys()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "80ba6ebb-d679-4767-8917-348fe441f185",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"dict_keys(['Analog Video', 'Digital Video', 'Integrated Video'])"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b = {e.h3.string: e for e in\n",
|
||||
" a['Video'].find_all(\"div\", {\"class\": \"well\"})\n",
|
||||
" if e.find_all('h3', id=lambda x: x is not None)}\n",
|
||||
"b.keys()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "d414e989-ba00-4a60-a357-571286f36276",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"dict_keys(['composite', 'component_ypbpr', 's-video', 'yc-688', 'rgbs', 'rgbvh'])"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"c = {\n",
|
||||
" e.h4['id']:e for e in\n",
|
||||
" b['Analog Video'].find_all(\"div\", {\"class\": \"well\"})\n",
|
||||
" if e.h4 is not None}\n",
|
||||
"c.keys()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "97b1f814-c16e-42d3-96ff-b2faa27d5830",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"dict_keys(['Composite RCA', 'Composite BNC', 'Composite UHF', 'Composite F-Type', 'Composite Video Patch (MUSA)', 'Composite 8-pin EIAJ', 'Composite SCART'])"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d = {e.h3.string: e.p.string for e\n",
|
||||
" in c['composite'].find_all(\"div\", {\"class\": \"modal fade\"})}\n",
|
||||
"d.keys()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "2fca9ace-a92f-4eda-adbe-d20c845f2cd5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/javascript": [
|
||||
"$(document).ready(\n",
|
||||
" function() {\n",
|
||||
" function appendUniqueDiv(){\n",
|
||||
" // append a div with our uuid so we can check that it's already\n",
|
||||
" // been sent and avoid duplicates on page reload\n",
|
||||
" var notifiedDiv = document.createElement(\"div\")\n",
|
||||
" notifiedDiv.id = \"53b8a038-c6fd-4ea1-b291-5403fb3b742d\"\n",
|
||||
" element.append(notifiedDiv)\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" // only send notifications if the pageload is complete; this will\n",
|
||||
" // help stop extra notifications when a saved notebook is loaded,\n",
|
||||
" // which during testing gives us state \"interactive\", not \"complete\"\n",
|
||||
" if (document.readyState === 'complete') {\n",
|
||||
" // check for the div that signifies that the notification\n",
|
||||
" // was already sent\n",
|
||||
" if (document.getElementById(\"53b8a038-c6fd-4ea1-b291-5403fb3b742d\") === null) {\n",
|
||||
" var notificationPayload = {\"requireInteraction\": false, \"icon\": \"/static/base/images/favicon.ico\", \"body\": \"Cell execution has finished!\", \"only_in_background\": false};\n",
|
||||
"\n",
|
||||
" // We have a notification but the window is active\n",
|
||||
" if (notificationPayload.only_in_background && !window.jupyterNotifyIsInBackground) {\n",
|
||||
" appendUniqueDiv();\n",
|
||||
" return;\n",
|
||||
" }\n",
|
||||
" if (Notification.permission !== 'denied') {\n",
|
||||
" if (Notification.permission !== 'granted') { \n",
|
||||
" Notification.requestPermission(function (permission) {\n",
|
||||
" if(!('permission' in Notification)) {\n",
|
||||
" Notification.permission = permission\n",
|
||||
" }\n",
|
||||
" })\n",
|
||||
" }\n",
|
||||
" if (Notification.permission === 'granted') {\n",
|
||||
" var notification = new Notification(\"Jupyter Notebook\", notificationPayload)\n",
|
||||
" appendUniqueDiv()\n",
|
||||
" notification.onclick = function () {\n",
|
||||
" window.focus();\n",
|
||||
" this.close();\n",
|
||||
" };\n",
|
||||
" } \n",
|
||||
" } \n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
")\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Javascript object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%notify"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue