# wiseflow/test/fetching_for_sample.py
# -*- coding: utf-8 -*-
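# Fetch sample pages with crawlee's PlaywrightCrawler and, for each URL, dump the
# cleaned page text plus link/action dictionaries under webpage_samples/.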
from bs4 import BeautifulSoup
import os
import json
import asyncio
from urllib.parse import urlparse, urljoin
import hashlib
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
from datetime import timedelta
sites = ["https://www.gd121.cn/zx/qxzx/list.shtml",
]
os.environ['CRAWLEE_STORAGE_DIR'] = 'webpage_samples/crawlee_storage'
save_dir = 'webpage_samples'


async def main(sites: list):
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        # max_requests_per_crawl=1,
        max_request_retries=1,
        request_handler_timeout=timedelta(minutes=5)
    )

    @crawler.pre_navigation_hook
    async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
        context.log.info(f'navigating {context.request.url} ...')
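
    # default handler: clean the fetched DOM, extract links and interactive controls,
    # then dump everything for this URL to disk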
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        await context.page.wait_for_load_state('networkidle')
        await context.page.wait_for_timeout(2000)

        # Handle dialogs (alerts, confirms, prompts)
        async def handle_dialog(dialog):
            context.log.info(f'Closing dialog: {dialog.message}')
            await dialog.accept()

        context.page.on('dialog', handle_dialog)
        context.log.info('successfully finished fetching')

        folder = os.path.join(save_dir, f"{hashlib.sha256(context.request.url.encode()).hexdigest()[-6:]}")
        os.makedirs(folder, exist_ok=True)
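
        # parse <head> for the page title and an optional <base href> used to resolve relative URLs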
        html = await context.page.inner_html('head')
        soup = BeautifulSoup(html, 'html.parser')
        web_title = soup.find('title')
        if web_title:
            web_title = web_title.get_text().strip()
        else:
            web_title = ''

        base_tag = soup.find('base', href=True)
        if base_tag and base_tag.get('href'):
            base_url = base_tag['href']
        else:
            # if no base tag, use the current url as base url
            parsed_url = urlparse(context.request.url)
            domain = parsed_url.netloc
            base_url = f"{parsed_url.scheme}://{domain}"

        html = await context.page.inner_html('body')
        # to use a custom scraper here
        soup = BeautifulSoup(html, 'html.parser')

        # remove common elements such as navigation, header and footer
        for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']:
            elements = soup.select(selector)
            for element in elements:
                element.decompose()
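
        # action_dict maps a form id (or a standalone control name) to its controls,
        # each recorded as {"type": ..., "values": [...]}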
        action_dict = {}
        for form in soup.find_all('form', recursive=True):
            form_dict = {}
            for input_elem in form.find_all('input'):
                input_type = input_elem.get('type', 'text')
                input_name = input_elem.get('name', f'input_{len(action_dict)}')
                input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
                input_dict = {
                    "type": input_type,
                    "values": [input_value] if input_value else []
                }
                # handle datalist
                if input_elem.get('list'):
                    datalist = soup.find('datalist', id=input_elem['list'])
                    if datalist:
                        options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                        input_dict = {
                            "type": "text",
                            "values": [f"one of followings: {options}"]
                        }
                form_dict[input_name] = input_dict
            for select in form.find_all('select'):
                select_name = select.get('name', f'select_{len(form_dict)}')
                options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
                form_dict[select_name] = {
                    "type": "select",
                    "values": options
                }
            for textarea in form.find_all('textarea'):
                textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
                form_dict[textarea_name] = {
                    "type": "textarea",
                    "values": [textarea.text.strip()]
                }
            if form_dict:
                form_id = form.get('id', f'form_{len(action_dict)}')
                action_dict[form_id] = form_dict
            form.decompose()

        # handle input elements that are not in any form
        for input_elem in soup.find_all('input', recursive=True):
            if input_elem.find_parent('form') is None:
                # check if the input is associated with a form by form attribute
                form_ids = input_elem.get('form', '').split()
                # handle input element
                input_type = input_elem.get('type', 'text')
                input_name = input_elem.get('name', f'input_{len(action_dict)}')
                input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
                input_dict = {
                    "type": input_type,
                    "values": [input_value] if input_value else []
                }
                # handle datalist
                if input_elem.get('list'):
                    datalist = soup.find('datalist', id=input_elem['list'])
                    if datalist:
                        options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                        input_dict = {
                            "type": "text",
                            "values": [f"one of followings: {options}"]
                        }
                # decide the placement of the input element based on form attribute
                if form_ids:
                    for form_id in form_ids:
                        if form_id in action_dict:
                            action_dict[form_id][input_name] = input_dict
                        else:
                            action_dict[form_id] = {input_name: input_dict}
                else:
                    action_dict[input_name] = {"input": input_dict}
                input_elem.decompose()
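
        # buttons and <command> elements become standalone entries in action_dict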
        for button in soup.find_all('button', recursive=True) + soup.select('input[type="button"], input[type="submit"]'):
            button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
            if not button_name:
                button_name = f'button_{len(action_dict)}'
            button_type = button.get('type', 'button')
            button_value = button.get('value', button.text.strip())
            action_dict[button_name] = {
                "button": {
                    "type": button_type,
                    "values": [button_value] if button_value else []
                }
            }
            button.decompose()

        for command in soup.find_all('command', recursive=True):
            command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
            if not command_name:
                command_name = f'command_{len(action_dict)}'
            command_type = command.get('type', 'command')
            command_value = command.get('value', command.text.strip())
            action_dict[command_name] = {
                "command": {
                    "type": command_type,
                    "values": [command_value] if command_value else []
                }
            }
            command.decompose()
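
        # link_dict maps generated keys ("url0", "url1", ...) to absolute URLs; the matching
        # [urlN] markers are inlined into the page text in place of the original elements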
        link_dict = {}
        for img in soup.find_all('img', src=True, recursive=True):
            src = img.get('src')
            if src.startswith(('#', 'about:blank')):
                src = None
            text = img.get('alt', '').strip()
            if src:
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)
                key = f"url{len(link_dict)}"
                link_dict[key] = src
                text = f"{text}<img>[{key}]"
            # find all area urls related to this img
            area_urls = set()
            if img.get('usemap'):
                # remove the # at the beginning of the map name
                map_name = img.get('usemap').lstrip('#')
                # find the map tag
                map_tag = soup.find('map', {'name': map_name})
                if map_tag:
                    # get all area tags under the map
                    for area in map_tag.find_all('area', href=True):
                        area_href = area.get('href')
                        if area_href.startswith(('javascript:', '#', 'mailto:', 'data:', 'about:blank')):
                            area_href = None
                        if area_href:
                            if not area_href.startswith(('http://', 'https://')):
                                area_href = urljoin(base_url, area_href)
                            area_urls.add(area_href)
                        area.decompose()
                    # delete the whole map tag
                    map_tag.decompose()
            for area_url in area_urls:
                if area_url in [context.request.url, base_url]:
                    continue
                key = f"url{len(link_dict)}"
                link_dict[key] = area_url
                text = f"{text}[{key}]"
            img.replace_with(f"-{text}")
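
        # media and <object> elements are replaced the same way, tagged with a file-extension hint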
        for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True):
            src = media.get('src')
            if src.startswith(('javascript:', '#', 'mailto:', 'data:', 'about:blank')):
                src = None
            text = media.get('alt', '').strip() or media.get_text().strip()
            if src:
                # convert relative path to full url
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(context.request.url, src)
                key = f"url{len(link_dict)}"
                link_dict[key] = src
                ext = os.path.splitext(src)[1].lstrip('.') or media.name
                text = f"{text}<{ext}>[{key}]"
            media.replace_with(f"-{text}")

        for obj in soup.find_all('object', data=True, recursive=True):
            data = obj.get('data')
            if data.startswith(('javascript:', '#', 'mailto:', 'data:', 'about:blank')):
                data = None
            text = obj.get('title', '').strip() or obj.get_text().strip()
            if data:
                # convert relative path to full url
                if not data.startswith(('http://', 'https://')):
                    data = urljoin(context.request.url, data)
                key = f"url{len(link_dict)}"
                link_dict[key] = data
                ext = os.path.splitext(data)[1].lstrip('.') or 'object'
                text = f"{text}<{ext}>[{key}]"
            obj.replace_with(f"-{text}")

        # process links at last, so that we can keep the image and media info in the link
        for a in soup.find_all('a', href=True, recursive=True):
            href = a.get('href')
            if href.startswith(('javascript:', '#', 'mailto:', 'data:', 'about:blank')):
                href = None
            if href:
                text = a.get_text().strip() or '-'
                if not href.startswith(('http://', 'https://')):
                    href = urljoin(context.request.url, href)
                if href in [context.request.url, base_url]:
                    continue
                key = f"url{len(link_dict)}"
                link_dict[key] = href
                a.replace_with(f"{text}[{key}]")
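
        # normalization passes: turn headings, line breaks, lists, spans, strikethrough
        # and tables into a markdown-like plain-text form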
        # handle headings
        for i in range(1, 7):  # h1 to h6
            for heading in soup.find_all(f'h{i}', recursive=False):
                text = heading.get_text().strip()
                heading.replace_with(f"{'#' * i} {text}\n")

        # replace line breaks and horizontal rules with newlines
        for br in soup.find_all(['br', 'hr', 'wbr'], recursive=True):
            br.replace_with('\n')

        # handle lists
        for list_tag in soup.find_all(['ul', 'ol'], recursive=True):
            list_text = []
            for idx, item in enumerate(list_tag.find_all('li')):
                list_text.append(f"{idx + 1}. {item.get_text().strip()}")
            list_text = '\t'.join(list_text)
            list_tag.replace_with(f"{list_text}\n")

        # handle spans - merge span text with surrounding text
        for span in soup.find_all('span', recursive=True):
            span.replace_with(span.get_text().strip())

        # handle strikethrough text
        for del_tag in soup.find_all(['del', 's'], recursive=True):
            del_text = del_tag.get_text().strip()
            if del_text:
                del_tag.replace_with(f"{del_text}(maybe_outdated)")
            else:
                del_tag.decompose()

        # handle tables
        for table in soup.find_all('table', recursive=True):
            table_text = []
            # handle caption
            caption = table.find('caption')
            if caption:
                table_text.append(caption.get_text().strip())
            # get headers
            headers = []
            for th in table.find_all('th'):
                headers.append(th.get_text().strip())
            # handle all rows (including tbody and tfoot)
            for row in table.find_all('tr'):
                # get the first cell value
                # try to find th as first_val
                first_cell = row.find(['th', 'td'])
                if not first_cell:
                    continue
                first_val = first_cell.get_text().strip()
                cells = row.find_all('td')
                if not cells:
                    continue
                # handle remaining cells
                for idx, cell in enumerate(cells):
                    cell_text = cell.get_text().strip()
                    if not cell_text or cell_text == first_val:
                        continue
                    header_text = headers[idx] if idx < len(headers) else ''
                    cell_str = f"{first_val}-{header_text}-{cell_text}"
                    table_text.append(cell_str)
            # replace the table with the processed text
            table_text = '\n'.join(table_text)
            table.replace_with(f"\n{table_text}\n")
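
        # flatten the remaining DOM to plain text and write the three artifacts for this URL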
        html_text = soup.get_text(strip=False, separator='\n')
        with open(os.path.join(folder, 'text.txt'), 'w', encoding='utf-8') as f:
            f.write(html_text)
        with open(os.path.join(folder, 'link_dict.json'), 'w', encoding='utf-8') as f:
            json.dump(link_dict, f, indent=4, ensure_ascii=False)
        with open(os.path.join(folder, 'action_dict.json'), 'w', encoding='utf-8') as f:
            json.dump(action_dict, f, indent=4, ensure_ascii=False)

        # screenshot_file = os.path.join(folder, 'screenshot.jpg')
        # await context.page.screenshot(path=screenshot_file, full_page=True)

    await crawler.run(sites)


if __name__ == '__main__':
    asyncio.run(main(sites))