Skip to content

Commit 46e6072

Browse files
committed
fixup! Minimal scraper working from end-to-end
1 parent 4f4c5ba commit 46e6072

19 files changed

Lines changed: 186 additions & 222 deletions

File tree

.github/workflows/Tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,4 @@ jobs:
105105
run: docker run -v $PWD/output:/output libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo"
106106

107107
- name: Run integration test suite
108-
run: docker run -v $PWD/scraper/tests-integration:/src/scraper/tests-integration -v $PWD/output:/output libretexts2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration"
108+
run: docker run -v $PWD/scraper/tests-integration:/src/scraper/tests-integration -v $PWD/output:/output -e ZIM_FILE_PATH=/output/tests_en_libretexts-geo.zim libretexts2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration"

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,5 +312,6 @@ pyrightconfig.json
312312

313313
# End of https://www.toptal.com/developers/gitignore/api/node,python
314314

315-
316-
output
315+
.vscode
316+
output
317+
tmp

.vscode/extensions.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

.vscode/settings.json

Lines changed: 0 additions & 22 deletions
This file was deleted.

CONTRIBUTING.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ To simplify this, it is possible to:
1616
- extract assets from generated files and place them in a directory where ZIM UI will find them
1717
- iterate on ZIM UI code
1818

19+
This needs to be done every time you make significant changes to the scraper (Python code) that have an impact on files used by the Vue.JS UI.
20+
1921
To achieve this, first build the Docker image based on current code base.
2022

2123
```
@@ -31,7 +33,7 @@ docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim
3133
Extract interesting ZIM content and move it to `public` folder.
3234

3335
```
34-
find zimui/public/content -mindepth 1 -delete
36+
rm -rf zimui/public/content
3537
docker run -it --rm -v $(pwd)/output:/data ghcr.io/openzim/zim-tools:latest zimdump dump --dir=/data/tests_en_libretexts-geo /data/tests_en_libretexts-geo.zim
3638
sudo chown -R $(id -u -n):$(id -g -n) output/tests_en_libretexts-geo
3739
mkdir -p zimui/public/content
@@ -46,7 +48,7 @@ cd zimui
4648
yarn dev
4749
```
4850

49-
Do not forget to cleanup `public` folder before building the docker image again, otherwise all assets will be pushed to the ZIM.
51+
Do not forget to clean up the `public/content` folder before building the Docker image again, otherwise all assets will be pushed to the ZIM.
5052

5153
```
5254
rm -rf zimui/public/content

scraper/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
55
[project]
66
name = "libretexts2zim"
77
requires-python = ">=3.12,<3.13"
8-
description = "Make ZIM file from LibreTexts courses"
8+
description = "Make ZIM file from LibreTexts libraries"
99
readme = "../README.md"
1010
dependencies = [
1111
"yt-dlp", # youtube-dl should be updated as frequently as possible
@@ -19,6 +19,7 @@ dependencies = [
1919
"schedule==1.2.2",
2020
"beautifulsoup4==4.12.3",
2121
"types-beautifulsoup4==4.12.0.20240907",
22+
"lxml==5.3.0",
2223
]
2324
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
2425

scraper/src/libretexts2zim/client.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,18 @@ class LibreTextsParsingError(Exception):
1515
pass
1616

1717

18-
class LibreTextsShelve(BaseModel):
19-
title: str
20-
content_url: str
21-
image_url: str
22-
23-
2418
class LibreTextsHome(BaseModel):
2519
welcome_text_paragraphs: list[str]
26-
shelves: list[LibreTextsShelve]
2720
welcome_image_url: str
2821

2922

3023
class LibreTextsMetadata(BaseModel):
31-
"""Metadata about a course."""
24+
"""Metadata about a library."""
3225

33-
# Human readable name for the course.
26+
# Human readable name for the library.
3427
name: str
3528

36-
# URL prefix for the course, e.g. for Geosciences which is at
29+
# URL prefix for the library, e.g. for Geosciences which is at
3730
# https://geo.libretexts.org/, the slug is `geo`
3831
slug: str
3932

@@ -91,12 +84,11 @@ def get_home(self) -> LibreTextsHome:
9184
return LibreTextsHome(
9285
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
9386
welcome_image_url=_get_welcome_image_url_from_home(soup),
94-
shelves=[],
9587
)
9688

9789

9890
def _get_soup(content: str) -> BeautifulSoup:
99-
return BeautifulSoup(content, "html.parser")
91+
return BeautifulSoup(content, "lxml")
10092

10193

10294
def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
@@ -125,11 +117,7 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
125117
"<section> with class 'mt-content-container' not found"
126118
)
127119
welcome_text: list[str] = []
128-
for paragraph in content_section.find_all("p"):
129-
if paragraph.find("div", class_="mt-category-container"):
130-
# once we found a mt-category-container div, we are not in the welcome text
131-
# anymore
132-
break
120+
for paragraph in content_section.find_all("p", recursive=False):
133121
if paragraph_text := paragraph.text:
134122
welcome_text.append(paragraph_text)
135123
return welcome_text

scraper/src/libretexts2zim/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
VERSION = __version__
1212
ROOT_DIR = pathlib.Path(__file__).parent
1313

14-
# As of 2024-09-24, all courses appears to be in English.
14+
# As of 2024-09-24, all libraries appear to be in English.
1515
LANGUAGE_ISO_639_3 = "eng"
1616

1717
logger = getLogger(NAME, level=logging.DEBUG)

scraper/src/libretexts2zim/entrypoint.py

Lines changed: 140 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
import argparse
22
import logging
33
import os
4+
from pathlib import Path
5+
6+
from zimscraperlib.constants import (
7+
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
8+
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
9+
RECOMMENDED_MAX_TITLE_LENGTH,
10+
)
411

512
from libretexts2zim.client import LibreTextsClient
613
from libretexts2zim.constants import (
714
NAME,
815
VERSION,
916
logger,
1017
)
11-
from libretexts2zim.generator import ContentFilter, Generator
18+
from libretexts2zim.processor import ContentFilter, Processor
1219
from libretexts2zim.zimconfig import ZimConfig
1320

1421

@@ -28,13 +35,111 @@ def zim_defaults() -> ZimConfig:
2835
)
2936

3037

31-
def main() -> None:
32-
parser = argparse.ArgumentParser(
33-
prog=NAME,
38+
def add_zim_config_flags(parser: argparse.ArgumentParser, defaults: "ZimConfig"):
39+
"""
40+
Adds flags related to zim configuration
41+
42+
Flags are added to the given parser with given defaults.
43+
"""
44+
45+
parser.add_argument(
46+
"--library-name",
47+
help="Display name for the library, e.g. Geosciences",
48+
required=True,
3449
)
3550

3651
parser.add_argument(
37-
"--debug", help="Enable verbose output", action="store_true", default=False
52+
"--creator",
53+
help=f"Name of content creator. Default: {defaults.creator!r}",
54+
default=defaults.creator,
55+
)
56+
57+
parser.add_argument(
58+
"--publisher",
59+
help=f"Custom publisher name. Default: {defaults.publisher!r}",
60+
default=defaults.publisher,
61+
)
62+
63+
parser.add_argument(
64+
"--file-name-format",
65+
help="Custom file name format for individual ZIMs. "
66+
f"Default: {defaults.file_name_format!r}",
67+
default=defaults.file_name_format,
68+
metavar="FORMAT",
69+
)
70+
71+
parser.add_argument(
72+
"--name-format",
73+
help="Custom name format for individual ZIMs. "
74+
f"Default: {defaults.name_format!r}",
75+
default=defaults.name_format,
76+
metavar="FORMAT",
77+
)
78+
79+
parser.add_argument(
80+
"--title-format",
81+
help=f"Custom title format for individual ZIMs. Final value must not be "
82+
f"longer than {RECOMMENDED_MAX_TITLE_LENGTH} chars. "
83+
f"Default: {defaults.title_format!r}",
84+
default=defaults.title_format,
85+
metavar="FORMAT",
86+
)
87+
88+
parser.add_argument(
89+
"--description-format",
90+
help="Custom description format for individual ZIMs. Final value must not "
91+
f"be longer than {MAXIMUM_DESCRIPTION_METADATA_LENGTH} chars. "
92+
f"Default: {defaults.title_format!r}",
93+
default=defaults.description_format,
94+
metavar="FORMAT",
95+
)
96+
97+
parser.add_argument(
98+
"--long-description-format",
99+
help="Custom long description format for your ZIM. Final value must not be "
100+
f"longer than {MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH} chars. "
101+
f"Default: {defaults.long_description_format!r}",
102+
default=defaults.long_description_format,
103+
metavar="FORMAT",
104+
)
105+
106+
# Due to https://github.com/python/cpython/issues/60603 defaulting an array in
107+
# argparse doesn't work so we expose the underlying semicolon delimited string.
108+
parser.add_argument(
109+
"--tags",
110+
help="A semicolon (;) delimited list of tags to add to the ZIM."
111+
"Formatting is supported. "
112+
f"Default: {defaults.tags!r}",
113+
default=defaults.tags,
114+
)
115+
116+
parser.add_argument(
117+
"--secondary-color",
118+
help="Secondary (background) color of ZIM UI. Default: "
119+
f"{defaults.secondary_color!r}",
120+
default=defaults.secondary_color,
121+
)
122+
123+
124+
def add_content_filter_flags(parser: argparse.ArgumentParser):
125+
"""Adds flags related to content filtering to the given parser."""
126+
127+
parser.add_argument(
128+
"--shelves-include",
129+
help="Includes only shelves matching the given regular expression.",
130+
metavar="REGEX",
131+
)
132+
133+
parser.add_argument(
134+
"--shelves-exclude",
135+
help="Excludes shelves matching the given regular expression.",
136+
metavar="REGEX",
137+
)
138+
139+
140+
def main() -> None:
141+
parser = argparse.ArgumentParser(
142+
prog=NAME,
38143
)
39144

40145
parser.add_argument(
@@ -44,39 +149,41 @@ def main() -> None:
44149
version=VERSION,
45150
)
46151

152+
# Client configuration flags
153+
parser.add_argument(
154+
"--library-slug",
155+
help="URL prefix for the library, e.g. for Geosciences which is at "
156+
"https://geo.libretexts.org/, the slug is `geo`",
157+
required=True,
158+
)
159+
160+
# ZIM configuration flags
161+
add_zim_config_flags(parser, zim_defaults())
162+
163+
# Document selection flags
164+
add_content_filter_flags(parser)
165+
47166
parser.add_argument(
48167
"--output",
49168
help="Output folder for ZIMs. Default: /output",
50169
default="/output",
51170
dest="output_folder",
52171
)
53172

173+
parser.add_argument(
174+
"--debug", help="Enable verbose output", action="store_true", default=False
175+
)
176+
54177
parser.add_argument(
55178
"--zimui-dist",
56179
type=str,
57180
help=(
58-
"Directory containing Vite build output from the Zim UI Vue.JS application"
181+
"Dev option to customize directory containing Vite build output from the "
182+
"ZIM UI Vue.JS application"
59183
),
60184
default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"),
61185
)
62186

63-
# ZIM configuration flags
64-
ZimConfig.add_flags(
65-
parser,
66-
zim_defaults(),
67-
)
68-
69-
# Document selection flags
70-
ContentFilter.add_flags(parser)
71-
72-
# Client configuration flags
73-
parser.add_argument(
74-
"--library-slug",
75-
help="URL prefix for the course, e.g. for Geosciences which is at "
76-
"https://geo.libretexts.org/, the slug is `geo`",
77-
required=True,
78-
)
79-
80187
args = parser.parse_args()
81188

82189
logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
@@ -88,17 +195,20 @@ def main() -> None:
88195
library_slug=args.library_slug,
89196
)
90197

91-
Generator(
198+
Processor(
92199
libretexts_client=libretexts_client,
93200
zim_config=zim_config,
94-
output_folder=args.output_folder,
95-
zimui_dist=args.zimui_dist,
201+
output_folder=Path(args.output_folder),
202+
zimui_dist=Path(args.zimui_dist),
96203
content_filter=doc_filter,
97204
).run()
98-
except Exception as e:
99-
logger.exception(e)
100-
logger.error(f"Generation failed with the following error: {e}")
101-
raise SystemExit(1) from e
205+
except SystemExit:
206+
logger.error("Generation failed, exiting")
207+
raise
208+
except Exception as exc:
209+
logger.exception(exc)
210+
logger.error(f"Generation failed with the following error: {exc}")
211+
raise SystemExit(1) from exc
102212

103213

104214
if __name__ == "__main__":

0 commit comments

Comments
 (0)