refactor: replace generic catalog DTOs with shape-typed DTOs for type safety

Replace the single CatalogMaterialDto + CatalogDimensionsDto (bag of nullable
fields) with per-shape DTOs that have strongly-typed dimension properties.
Catalog JSON now groups materials by shape key instead of a flat array.
Delete the old SeedController/SeedDataDtos (superseded by CatalogService).
Scraper updated to emit the new grouped format, resume by default, and
save items incrementally.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 15:48:35 -05:00
parent c31769a746
commit 7d3c92226c
8 changed files with 13874 additions and 15830 deletions

View File

@@ -4,6 +4,11 @@
"Bars",
"A-36",
"ROUND"
],
[
"Bars",
"A-36",
"FLAT"
]
],
"items": [

View File

@@ -5,10 +5,10 @@ Scrapes myalro.com's SmartGrid for Carbon Steel materials and outputs
a catalog JSON matching the O'Neal catalog format.
Usage:
python scrape_alro.py # Scrape filtered grades (edit GRADE_FILTER below)
python scrape_alro.py # Scrape filtered grades (resumes from saved progress)
python scrape_alro.py --all-grades # Scrape ALL grades (slow)
python scrape_alro.py --discover # Scrape first item only, dump HTML/screenshots
python scrape_alro.py --resume # Resume from saved progress
python scrape_alro.py --fresh # Start fresh, ignoring saved progress
"""
import asyncio
@@ -342,8 +342,14 @@ async def get_select_options(page: Page, sel_id: str):
async def scrape_dims_panel(page: Page, grade: str, shape_alro: str,
shape_mapped: str, *, save_discovery: bool = False):
"""Main Level 3 extraction. Returns list of raw item dicts."""
shape_mapped: str, *, save_discovery: bool = False,
on_item=None, scraped_dim_a: set[str] | None = None):
"""Main Level 3 extraction. Returns list of raw item dicts.
If on_item callback is provided, it is called with each item dict
as soon as it is discovered (for incremental saving).
If scraped_dim_a is provided, DimA values in that set are skipped (resume).
"""
items: list[dict] = []
if save_discovery:
@@ -368,9 +374,18 @@ async def scrape_dims_panel(page: Page, grade: str, shape_alro: str,
log.warning(f" Could not dump dims panel: {e}")
return []
log.info(f" DimA: {len(dim_a_opts)} sizes")
already_done = scraped_dim_a or set()
remaining = [(v, t) for v, t in dim_a_opts if v not in already_done]
if already_done:
log.info(f" DimA: {len(dim_a_opts)} sizes ({len(dim_a_opts) - len(remaining)} already scraped, {len(remaining)} remaining)")
else:
log.info(f" DimA: {len(dim_a_opts)} sizes")
for a_val, a_text in dim_a_opts:
# All DimA values already scraped — combo is complete
if not remaining:
return []
for a_val, a_text in remaining:
# Select DimA → triggers postback → DimB/Length populate
await page.select_option(f"#{ID['dim_a']}", a_val)
await asyncio.sleep(DELAY)
@@ -394,29 +409,38 @@ async def scrape_dims_panel(page: Page, grade: str, shape_alro: str,
lengths = await get_select_options(page, ID["dim_length"])
for l_val, l_text in lengths:
items.append(_make_item(
item = _make_item(
grade, shape_mapped,
a_val, a_text, b_val, b_text, c_val, c_text,
l_text,
))
)
items.append(item)
if on_item:
on_item(item)
else:
# No DimC — read lengths
lengths = await get_select_options(page, ID["dim_length"])
for l_val, l_text in lengths:
items.append(_make_item(
item = _make_item(
grade, shape_mapped,
a_val, a_text, b_val, b_text, None, None,
l_text,
))
)
items.append(item)
if on_item:
on_item(item)
else:
# No DimB — just DimA + Length
lengths = await get_select_options(page, ID["dim_length"])
for l_val, l_text in lengths:
items.append(_make_item(
item = _make_item(
grade, shape_mapped,
a_val, a_text, None, None, None, None,
l_text,
))
)
items.append(item)
if on_item:
on_item(item)
return items
@@ -467,7 +491,7 @@ def build_size_and_dims(shape: str, item: dict):
return f'{a_txt}"', {"width": round(a, 4), "thickness": 0}
if shape == "SquareBar" and a is not None:
return f'{a_txt}"', {"size": round(a, 4)}
return f'{a_txt}"', {"sideLength": round(a, 4)}
if shape == "Angle":
if a is not None and b is not None:
@@ -495,9 +519,9 @@ def build_size_and_dims(shape: str, item: dict):
if shape == "SquareTube":
if a is not None and b is not None:
return (f'{a_txt}" x {b_txt}" wall',
{"size": round(a, 4), "wall": round(b, 4)})
{"sideLength": round(a, 4), "wall": round(b, 4)})
if a is not None:
return f'{a_txt}"', {"size": round(a, 4), "wall": 0}
return f'{a_txt}"', {"sideLength": round(a, 4), "wall": 0}
if shape == "RectangularTube":
if a is not None and b is not None and c is not None:
@@ -524,6 +548,20 @@ def build_size_and_dims(shape: str, item: dict):
return a_txt or "", {}
# Maps an internal shape name to the camelCase key under which its
# materials are grouped in the catalog JSON's "materials" object.
SHAPE_GROUP_KEY = dict(
    Angle="angles",
    Channel="channels",
    FlatBar="flatBars",
    IBeam="iBeams",
    Pipe="pipes",
    RectangularTube="rectangularTubes",
    RoundBar="roundBars",
    RoundTube="roundTubes",
    SquareBar="squareBars",
    SquareTube="squareTubes",
)
def build_catalog(scraped: list[dict]) -> dict:
"""Assemble the final catalog JSON from scraped item dicts."""
materials: dict[tuple, dict] = {}
@@ -538,14 +576,14 @@ def build_catalog(scraped: list[dict]) -> dict:
key = (shape, grade, size_str)
if key not in materials:
materials[key] = {
"shape": shape,
mat = {
"type": "Steel",
"grade": grade,
"size": size_str,
"dimensions": dims,
"stockItems": [],
}
mat.update(dims)
materials[key] = mat
length = item.get("length_inches")
if length and length > 0:
@@ -561,7 +599,12 @@ def build_catalog(scraped: list[dict]) -> dict:
}],
})
sorted_mats = sorted(materials.values(), key=lambda m: (m["shape"], m["grade"], m["size"]))
# Group by shape key
grouped: dict[str, list] = {v: [] for v in SHAPE_GROUP_KEY.values()}
for (shape, _, _), mat in sorted(materials.items(), key=lambda kv: (kv[0][0], kv[0][1], kv[0][2])):
group_key = SHAPE_GROUP_KEY.get(shape)
if group_key:
grouped[group_key].append(mat)
return {
"exportedAt": datetime.now(timezone.utc).isoformat(),
@@ -572,7 +615,7 @@ def build_catalog(scraped: list[dict]) -> dict:
{"name": "Cold Cut Saw", "kerfInches": 0.0625, "isDefault": False},
{"name": "Hacksaw", "kerfInches": 0.0625, "isDefault": False},
],
"materials": sorted_mats,
"materials": grouped,
}
@@ -596,20 +639,29 @@ def save_progress(progress: dict):
async def main():
discover = "--discover" in sys.argv
resume = "--resume" in sys.argv
fresh = "--fresh" in sys.argv
all_grades = "--all-grades" in sys.argv
progress = load_progress() if resume else {"completed": [], "items": []}
progress = {"completed": [], "items": []} if fresh else load_progress()
all_items: list[dict] = progress.get("items", [])
done_keys: set[tuple] = {tuple(k) for k in progress.get("completed", [])}
# Build index of saved DimA values per (grade, shape) for partial resume
saved_dim_a: dict[tuple[str, str], set[str]] = {}
if all_items and not fresh:
for item in all_items:
key = (item.get("grade", ""), item.get("shape", ""))
saved_dim_a.setdefault(key, set()).add(item.get("dim_a_val", ""))
log.info("Alro Steel SmartGrid Scraper")
if all_grades:
log.info(" Mode: ALL grades")
else:
log.info(f" Filtering to {len(GRADE_FILTER)} grades: {', '.join(sorted(GRADE_FILTER))}")
if resume:
log.info(f" Resuming: {len(done_keys)} combos done, {len(all_items)} items")
if fresh:
log.info(" Fresh start — ignoring saved progress")
elif done_keys:
log.info(f" Resuming: {len(done_keys)} combos done, {len(all_items)} items saved")
if discover:
log.info(" Discovery mode — will scrape first item then stop")
@@ -677,19 +729,30 @@ async def main():
await asyncio.sleep(DELAY)
combo_count = 0
def on_item_discovered(item):
nonlocal total_scraped, combo_count
all_items.append(item)
total_scraped += 1
combo_count += 1
progress["items"] = all_items
save_progress(progress)
# Pass already-scraped DimA values so partial combos resume correctly
already = saved_dim_a.get((grade_name, shape_mapped), set())
items = await scrape_dims_panel(
page, grade_name, shape_name, shape_mapped,
save_discovery=first_item or discover,
on_item=on_item_discovered,
scraped_dim_a=already,
)
first_item = False
all_items.extend(items)
total_scraped += len(items)
log.info(f" -> {len(items)} items (total {total_scraped})")
log.info(f" -> {combo_count} items (total {total_scraped})")
done_keys.add(combo_key)
progress["completed"] = [list(k) for k in done_keys]
progress["items"] = all_items
save_progress(progress)
await click_back(page)
@@ -714,14 +777,13 @@ async def main():
OUTPUT_PATH.write_text(json.dumps(catalog, indent=2, ensure_ascii=False), encoding="utf-8")
log.info(f"Written: {OUTPUT_PATH}")
log.info(f"Materials: {len(catalog['materials'])}")
total_stock = sum(len(m["stockItems"]) for m in catalog["materials"])
total_mats = sum(len(v) for v in catalog["materials"].values())
total_stock = sum(len(m["stockItems"]) for v in catalog["materials"].values() for m in v)
log.info(f"Materials: {total_mats}")
log.info(f"Stock items: {total_stock}")
by_shape: dict[str, int] = {}
for m in catalog["materials"]:
by_shape[m["shape"]] = by_shape.get(m["shape"], 0) + 1
for s, n in sorted(by_shape.items()):
log.info(f" {s}: {n}")
for shape_key, mats in sorted(catalog["materials"].items()):
if mats:
log.info(f" {shape_key}: {len(mats)}")
if __name__ == "__main__":