methodology/notebooks/html-parse.ipynb
2022-10-02 01:06:26 +02:00

378 lines
11 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "23283830-032b-484a-b599-66ba8e7c2001",
"metadata": {},
"source": [
"# HTML parse\n",
"\n",
"In this notebook:\n",
"* prepare the Jupyter environment\n",
"* download the [Cable Bible](https://amiaopensource.github.io/cable-bible/) index.html file\n",
"* parse the HTML"
]
},
{
"cell_type": "markdown",
"id": "ae76266b-b137-43ec-80f7-3f2c992215d9",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## prepare jupyter environment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6360c8e3-8958-41c5-938a-f713af2ae715",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"data": {
"application/javascript": [
"if (!(\"Notification\" in window)) {\n",
" alert(\"This browser does not support desktop notifications, so the %%notify magic will not work.\");\n",
"} else if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {\n",
" Notification.requestPermission(function (permission) {\n",
" if(!('permission' in Notification)) {\n",
" Notification.permission = permission;\n",
" }\n",
" })\n",
"}\n",
"\n",
"if(!window.jQuery) {\n",
" var jq = document.createElement('script');\n",
" jq.src = \"//ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js\";\n",
" document.getElementsByTagName('head')[0].appendChild(jq);\n",
"}\n",
"\n",
"// Detect if the window is out of focus.\n",
"window.jupyterNotifyIsInBackground = undefined;\n",
"(function() {\n",
" // Check document.hidden support\n",
" var hidden;\n",
" if (typeof document.hidden !== \"undefined\") { // Opera 12.10 and Firefox 18 and later support\n",
" hidden = \"hidden\";\n",
" } else if (typeof document.msHidden !== \"undefined\") {\n",
" hidden = \"msHidden\";\n",
" } else if (typeof document.webkitHidden !== \"undefined\") {\n",
" hidden = \"webkitHidden\";\n",
" }\n",
"\n",
" // Set initial background state\n",
" if (document[hidden]) {\n",
" window.jupyterNotifyIsInBackground = true;\n",
" } else {\n",
" window.jupyterNotifyIsInBackground = false;\n",
" }\n",
"\n",
" window.addEventListener('blur', function() { window.jupyterNotifyIsInBackground = true; }, false);\n",
" window.addEventListener('focus', function() { window.jupyterNotifyIsInBackground = false; }, false);\n",
"})();\n"
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Install the notebook tooling used here: jupyterlab-vim and jupyter-notify\n",
"# (the latter provides the %notify magic).\n",
"%pip install --quiet jupyterlab-vim\n",
"%pip install --quiet git+https://github.com/cphyc/jupyter-notify.git\n",
"# Register the extension; use `%reload_ext jupyternotify` instead to reload it\n",
"# in a kernel where it is already loaded.\n",
"%load_ext jupyternotify"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d5946431-9f33-4caa-835b-ab09fb73d0e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Parsing dependencies. pip's -q flag is additive: passing it three times\n",
"# raises the log threshold to CRITICAL and hides failed installs, so it is\n",
"# given once here (warnings and errors are still shown).\n",
"%pip install -q pandas html5lib beautifulsoup4"
]
},
{
"cell_type": "markdown",
"id": "bcbc54b8-7e77-446c-bb9d-4753bb61bdf9",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## download cable bible"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c245e2c1-a5f1-4fa2-9037-7e1773208854",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mkdir: cannot create directory /tmp/scrape-demo: File exists\n",
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 150k 100 150k 0 0 1089k 0 --:--:-- --:--:-- --:--:-- 1089k\n",
"<!DOCTYPE html>\n",
"<html lang=\"en\">\n"
]
}
],
"source": [
"# -p: succeed when the directory already exists, so the cell can be re-run.\n",
"!mkdir -p /tmp/scrape-demo\n",
"# --fail: exit with an error on an HTTP error status instead of saving the\n",
"# server's error page as index.html, which would then be parsed silently.\n",
"!curl --fail \\\n",
"    https://raw.githubusercontent.com/amiaopensource/cable-bible/master/index.html \\\n",
"    -o /tmp/scrape-demo/index.html\n",
"# Sanity check: show the first two lines of the downloaded file.\n",
"!head -2 /tmp/scrape-demo/index.html\n",
"#!cp /tmp/scrape-demo/index.html .\n",
"#!code ."
]
},
{
"cell_type": "markdown",
"id": "2e7e4c6d-6d1b-44fa-8527-dd847cd63895",
"metadata": {
"tags": []
},
"source": [
"## parse html"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b28f8b7c-05c0-462f-93ab-10ff0b35176f",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Decode explicitly: without an encoding argument read_text() uses the\n",
"# locale's preferred encoding, which varies between machines.\n",
"# NOTE(review): assumes the downloaded page is UTF-8 encoded - confirm.\n",
"html = Path('/tmp/scrape-demo/index.html').read_text(encoding='utf-8')\n",
"# Parse with Python's built-in 'html.parser' backend.\n",
"soup = BeautifulSoup(html, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "81767551-c9b6-4e98-80cf-1fc49b3a4262",
"metadata": {},
"outputs": [],
"source": [
"# First <div> whose class attribute is exactly this string.\n",
"central_div = soup.find('div', class_='well col-md-8 col-md-offset-0')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e134d6be-e32a-4cd7-91da-9ba359870e3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Video', 'Audio', 'Data', 'Power'])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Map each <h2> heading's .string to its enclosing div, looking at every div\n",
"# under central_div whose id is not 'table_of_contents' and that has an <h2>.\n",
"a = dict(\n",
"    (section.h2.string, section)\n",
"    for section in central_div.find_all(\n",
"        'div', id=lambda div_id: div_id != 'table_of_contents')\n",
"    if section.h2 is not None\n",
")\n",
"a.keys()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "80ba6ebb-d679-4767-8917-348fe441f185",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Analog Video', 'Digital Video', 'Integrated Video'])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 'well' divs inside the Video section that contain at least one <h3> with an\n",
"# id attribute, keyed by the .string of their first <h3>.\n",
"b = dict(\n",
"    (group.h3.string, group)\n",
"    for group in a['Video'].find_all('div', class_='well')\n",
"    if group.find_all('h3', id=True)\n",
")\n",
"b.keys()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d414e989-ba00-4a60-a357-571286f36276",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['composite', 'component_ypbpr', 's-video', 'yc-688', 'rgbs', 'rgbvh'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 'well' divs inside the Analog Video group that have an <h4>, keyed by that\n",
"# heading's id attribute.\n",
"c = dict(\n",
"    (signal.h4['id'], signal)\n",
"    for signal in b['Analog Video'].find_all('div', class_='well')\n",
"    if signal.h4 is not None\n",
")\n",
"c.keys()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "97b1f814-c16e-42d3-96ff-b2faa27d5830",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Composite RCA', 'Composite BNC', 'Composite UHF', 'Composite F-Type', 'Composite Video Patch (MUSA)', 'Composite 8-pin EIAJ', 'Composite SCART'])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For each 'modal fade' div under the composite entry, map the .string of its\n",
"# first <h3> to the .string of its first <p>.\n",
"d = dict(\n",
"    (modal.h3.string, modal.p.string)\n",
"    for modal in c['composite'].find_all('div', class_='modal fade')\n",
")\n",
"d.keys()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2fca9ace-a92f-4eda-adbe-d20c845f2cd5",
"metadata": {},
"outputs": [
{
"data": {
"application/javascript": [
"$(document).ready(\n",
" function() {\n",
" function appendUniqueDiv(){\n",
" // append a div with our uuid so we can check that it's already\n",
" // been sent and avoid duplicates on page reload\n",
" var notifiedDiv = document.createElement(\"div\")\n",
" notifiedDiv.id = \"53b8a038-c6fd-4ea1-b291-5403fb3b742d\"\n",
" element.append(notifiedDiv)\n",
" }\n",
"\n",
" // only send notifications if the pageload is complete; this will\n",
" // help stop extra notifications when a saved notebook is loaded,\n",
" // which during testing gives us state \"interactive\", not \"complete\"\n",
" if (document.readyState === 'complete') {\n",
" // check for the div that signifies that the notification\n",
" // was already sent\n",
" if (document.getElementById(\"53b8a038-c6fd-4ea1-b291-5403fb3b742d\") === null) {\n",
" var notificationPayload = {\"requireInteraction\": false, \"icon\": \"/static/base/images/favicon.ico\", \"body\": \"Cell execution has finished!\", \"only_in_background\": false};\n",
"\n",
" // We have a notification but the window is active\n",
" if (notificationPayload.only_in_background && !window.jupyterNotifyIsInBackground) {\n",
" appendUniqueDiv();\n",
" return;\n",
" }\n",
" if (Notification.permission !== 'denied') {\n",
" if (Notification.permission !== 'granted') { \n",
" Notification.requestPermission(function (permission) {\n",
" if(!('permission' in Notification)) {\n",
" Notification.permission = permission\n",
" }\n",
" })\n",
" }\n",
" if (Notification.permission === 'granted') {\n",
" var notification = new Notification(\"Jupyter Notebook\", notificationPayload)\n",
" appendUniqueDiv()\n",
" notification.onclick = function () {\n",
" window.focus();\n",
" this.close();\n",
" };\n",
" } \n",
" } \n",
" }\n",
" }\n",
" }\n",
")\n"
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%notify"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}