#!/usr/bin/env python3 """ Alro Steel SmartGrid Scraper Scrapes myalro.com's SmartGrid for Carbon Steel materials and outputs a catalog JSON matching the O'Neal catalog format. Usage: python scrape_alro.py # Scrape filtered grades (resumes from saved progress) python scrape_alro.py --all-grades # Scrape ALL grades (slow) python scrape_alro.py --discover # Scrape first item only, dump HTML/screenshots python scrape_alro.py --fresh # Start fresh, ignoring saved progress """ import asyncio import json import re import sys import logging from datetime import datetime, timezone from pathlib import Path from playwright.async_api import async_playwright, Page, TimeoutError as PwTimeout from playwright_stealth import Stealth # ── Logging ────────────────────────────────────────────────────────── logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S", ) log = logging.getLogger(__name__) # ── Paths ──────────────────────────────────────────────────────────── SCRIPT_DIR = Path(__file__).parent.resolve() OUTPUT_PATH = (SCRIPT_DIR / "../../CutList.Web/Data/SeedData/alro-catalog.json").resolve() PROGRESS_PATH = SCRIPT_DIR / "alro-scrape-progress.json" SCREENSHOTS_DIR = SCRIPT_DIR / "screenshots" # ── Config ─────────────────────────────────────────────────────────── BASE_URL = "https://www.myalro.com/SmartGrid.aspx?PT=Steel&Clear=true" DELAY = 5 # seconds between postback clicks TIMEOUT = 15_000 # ms for element waits CS_ROW = 4 # Carbon Steel row index in main grid CATEGORIES = ["Bars", "Pipe / Tube", "Structural"] # ┌─────────────────────────────────────────────────────────────────┐ # │ GRADE FILTER — only these grades will be scraped. │ # │ Use --all-grades flag to override and scrape everything. │ # │ Grade names must match the gpname attribute exactly. │ # └─────────────────────────────────────────────────────────────────┘ GRADE_FILTER = { # Common structural / general purpose "A-36", # Mild steel "1018 CF", "1018 HR", # Medium carbon (shafts, gears, pins) "1045 CF", "1045 HR", "1045 TG&P", # Free-machining "1144 CF", "1144 HR", "12L14 CF", # Hot-rolled plate/bar "1044 HR", # Stressproof (high-strength shafting) "A311/Stressproof", } # Alro shape column header → our MaterialShape enum SHAPE_MAP = { "ROUND": "RoundBar", "FLAT": "FlatBar", "SQUARE": "SquareBar", "ANGLE": "Angle", "CHANNEL": "Channel", "BEAM": "IBeam", "SQ TUBE": "SquareTube", "SQUARE TUBE": "SquareTube", "REC TUBE": "RectangularTube", "RECT TUBE": "RectangularTube", "RECTANGULAR TUBE": "RectangularTube", "ROUND TUBE": "RoundTube", "RND TUBE": "RoundTube", "PIPE": "Pipe", } # ── ASP.NET control IDs ───────────────────────────────────────── _CP = "ctl00_ContentPlaceHolder1" _PU = f"{_CP}_pnlPopUP" ID = dict( main_grid = f"{_CP}_grdMain", popup_grid = f"{_PU}_grdPopUp", popup_window = f"{_PU}_Window", dims_panel = f"{_PU}_upnlDims", back_btn = f"{_PU}_btnBack", # Dimension dropdowns (cascading: A → B → C → Length) dim_a = f"{_PU}_ddlDimA", dim_b = f"{_PU}_ddlDimB", dim_c = f"{_PU}_ddlDimC", dim_length = f"{_PU}_ddlLength", btn_next = f"{_PU}_btnSearch", ) # Postback targets ($ separators) PB = dict( main_grid = "ctl00$ContentPlaceHolder1$grdMain", popup_grid = "ctl00$ContentPlaceHolder1$pnlPopUP$grdPopUp", back_btn = "ctl00$ContentPlaceHolder1$pnlPopUP$btnBack", popup = "ctl00$ContentPlaceHolder1$pnlPopUP", dim_a = "ctl00$ContentPlaceHolder1$pnlPopUP$ddlDimA", dim_b = "ctl00$ContentPlaceHolder1$pnlPopUP$ddlDimB", dim_c = "ctl00$ContentPlaceHolder1$pnlPopUP$ddlDimC", ) # ═══════════════════════════════════════════════════════════════════════ # Utility helpers # ═══════════════════════════════════════════════════════════════════════ def parse_fraction(s: str) -> float | None: """Parse fraction/decimal string → float. '1-1/4' → 1.25, '.250' → 0.25""" if not s: return None s = s.strip().strip('"\'') # Collapse double spaces from Alro dropdown text ("1 1/4" → "1 1/4") s = re.sub(r"\s+", " ", s) if not s: return None try: return float(s) except ValueError: pass # Mixed fraction: "1-1/4" or "1 1/4" m = re.match(r"^(\d+)[\s-](\d+)/(\d+)$", s) if m: return int(m[1]) + int(m[2]) / int(m[3]) m = re.match(r"^(\d+)/(\d+)$", s) if m: return int(m[1]) / int(m[2]) m = re.match(r"^(\d+)$", s) if m: return float(m[1]) return None def decimal_to_fraction(value: float) -> str: """0.25 → '1/4', 1.25 → '1-1/4', 3.0 → '3'""" if value <= 0: return "0" whole = int(value) frac = value - whole if abs(frac) < 0.001: return str(whole) from math import gcd sixteenths = round(frac * 16) if sixteenths == 16: return str(whole + 1) g = gcd(sixteenths, 16) num, den = sixteenths // g, 16 // g frac_s = f"{num}/{den}" return f"{whole}-{frac_s}" if whole else frac_s def normalize_dim_text(s: str) -> str: """Normalize dimension text: '1 1/4' → '1-1/4', '3/16' → '3/16'""" s = re.sub(r"\s+", " ", s.strip()) # "1 1/4" → "1-1/4" (mixed fraction with space → hyphen) s = re.sub(r"^(\d+)\s+(\d+/\d+)$", r"\1-\2", s) return s def parse_length_to_inches(text: str) -> float | None: """Parse length string to inches. \"20'\" → 240, \"240\" → 240""" s = text.strip().upper() s = re.sub(r"\s*(RL|RANDOM.*|LENGTHS?|EA|EACH|STOCK)\s*", "", s).strip() m = re.match(r"^(\d+(?:\.\d+)?)\s*['\u2032]", s) if m: return float(m[1]) * 12 m = re.match(r"^(\d+(?:\.\d+)?)\s*FT", s) if m: return float(m[1]) * 12 m = re.match(r'^(\d+(?:\.\d+)?)\s*"?\s*$', s) if m: v = float(m[1]) return v * 12 if v <= 30 else v return None # ═══════════════════════════════════════════════════════════════════════ # SmartGrid navigation # ═══════════════════════════════════════════════════════════════════════ async def wait_for_update(page: Page, timeout: int = TIMEOUT): """Wait for ASP.NET partial postback to finish.""" try: await page.wait_for_load_state("networkidle", timeout=timeout) except PwTimeout: log.warning(" networkidle timeout – continuing") await asyncio.sleep(0.5) async def do_postback(page: Page, target: str, arg: str): """Execute a __doPostBack call.""" await page.evaluate(f"__doPostBack('{target}', '{arg}')") async def click_category(page: Page, category: str) -> bool: """Click a category blue-button for Carbon Steel in the main grid.""" log.info(f"Clicking main grid: {category} (row {CS_ROW})") arg = f"{category}${CS_ROW}" link = await page.query_selector( f"#{ID['main_grid']} a[href*=\"'{arg}'\"] img[src*='blue_button']" ) if not link: log.error(f" Button not found for {arg}") return False parent = await link.evaluate_handle("el => el.parentElement") await parent.as_element().click() try: await page.wait_for_selector(f"#{ID['popup_grid']}", state="visible", timeout=TIMEOUT) await wait_for_update(page) return True except PwTimeout: log.error(f" Popup did not appear for {category}") return False async def scrape_popup_grid(page: Page): """Parse the popup grid → [(grade_name, grade_id, shape, row_idx, has_btn)].""" headers = await page.eval_on_selector_all( f"#{ID['popup_grid']} tr.DataHeader th", "els => els.map(el => el.textContent.trim())", ) log.info(f" Popup columns: {headers}") rows = await page.query_selector_all( f"#{ID['popup_grid']} tr.griditemP, #{ID['popup_grid']} tr.gridaltItemP" ) combos = [] for row_idx, row in enumerate(rows): first_td = await row.query_selector("td[gpid]") if not first_td: continue gid = (await first_td.get_attribute("gpid") or "").strip() gname = (await first_td.get_attribute("gpname") or "").strip() tds = await row.query_selector_all("td") for col_idx, td in enumerate(tds): if col_idx == 0: continue shape = headers[col_idx] if col_idx < len(headers) else "" img = await td.query_selector("img[src*='blue_button']") combos.append((gname, gid, shape, row_idx, img is not None)) active = sum(1 for c in combos if c[4]) log.info(f" {active} active grade/shape combos") return combos async def click_shape(page: Page, shape: str, row_idx: int) -> bool: """Click a shape button in the popup grid; wait for dims panel.""" arg = f"{shape}${row_idx}" link = await page.query_selector( f"#{ID['popup_grid']} a[href*=\"'{arg}'\"] img[src*='blue_button']" ) if not link: try: await do_postback(page, PB["popup_grid"], arg) except Exception: log.warning(f" Could not click shape {arg}") return False else: parent = await link.evaluate_handle("el => el.parentElement") await parent.as_element().click() try: # Wait for the DimA dropdown to appear (the real indicator of dims panel loaded) await page.wait_for_selector(f"#{ID['dim_a']}", state="attached", timeout=TIMEOUT) await wait_for_update(page) return True except PwTimeout: # Check if panel has any content at all html = await page.inner_html(f"#{ID['dims_panel']}") if len(html.strip()) > 50: await wait_for_update(page) return True log.warning(f" Dims panel timeout for {arg}") return False async def click_back(page: Page): """Click Back to return to the popup grid view.""" try: await do_postback(page, PB["back_btn"], "") await wait_for_update(page) await asyncio.sleep(DELAY) except Exception as e: log.warning(f" Back button error: {e}") async def close_popup(page: Page): """Close the popup window and return to the main grid.""" try: await do_postback(page, PB["popup"], "Close") await wait_for_update(page) await asyncio.sleep(DELAY) except Exception as e: log.warning(f" Close popup error: {e}") # ═══════════════════════════════════════════════════════════════════════ # Level 3 — Dimension Panel Scraping # ═══════════════════════════════════════════════════════════════════════ async def get_select_options(page: Page, sel_id: str): """Return [(value, text), ...] for a