""" bizin_scraper.py - Scraper Avançado para Bizin.pt Usa undetected-chromedriver para contornar Cloudflare agressivo. Extrai dados completos de empresas e verifica sites externos. Author: Descomplicar® Crescimento Digital Copyright: 2026 Descomplicar® """ import csv import re import time import os from pathlib import Path from urllib.parse import urljoin import undetected_chromedriver as uc from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException, NoSuchElementException # --- CONFIGURAÇÕES --- BASE_URL = "https://pt.bizin.eu/por/" OUTPUT_CSV = Path(__file__).parent / "output/bizin_empresas.csv" EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' def get_driver(): """Configura e retorna um driver UC camuflado.""" options = uc.ChromeOptions() options.add_argument('--headless') # Tentar headless com UC (muito eficaz) options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') # Criar o driver driver = uc.Chrome(options=options) return driver def scrape_external_site(driver, url): """Visita o site externo para verificar status e extrair emails.""" if not url or 'bizin.eu' in url: return {"Site_Status": "N/A", "Emails_Site": "", "Contactos_Site": ""} if not url.startswith('http'): url = 'https://' + url try: print(f"🌍 A visitar site externo: {url}") driver.get(url) time.sleep(5) # Esperar carregamento page_source = driver.page_source emails = list(set(re.findall(EMAIL_REGEX, page_source))) email_str = "; ".join(emails[:5]) phone_regex = r'(?:\+351|00351)?[\s.-]?[239]\d{2}[\s.-]?\d{3}[\s.-]?\d{3}' phones = list(set(re.findall(phone_regex, page_source))) phones_str = "; ".join(phones[:3]) return { "Site_Status": "Ativo", "Emails_Site": email_str, "Contactos_Site": phones_str } except Exception as e: print(f"⚠️ Erro ao aceder {url}: {e}") return {"Site_Status": "Inativo/Erro", "Emails_Site": "", "Contactos_Site": ""} def extract_company_details(driver, company_url): """Extrai detalhes da página da empresa no Bizin.""" try: print(f"🏢 A processar empresa: {company_url}") driver.get(company_url) time.sleep(5) # Pausa para Cloudflare # Extrair dados via Selenium try: name = driver.find_element(By.TAG_NAME, "h1").text except: name = "N/A" try: # Procurar links de telefone phone_elem = driver.find_element(By.XPATH, "//a[contains(@href, 'tel:')]") phone_bizin = phone_elem.text except: phone_bizin = "N/A" try: # Morada (procurar por padrões comuns no Bizin) address = driver.find_element(By.XPATH, "//*[contains(@class, 'address')]").text except: address = "N/A" # Tentar encontrar Website website = None links = driver.find_elements(By.TAG_NAME, "a") for link in links: href = link.get_attribute("href") if href and href.startswith("http") and "bizin.eu" not in href and "facebook.com" not in href: website = href break data = { "Nome": name.strip(), "Telefone_Bizin": phone_bizin.strip(), "Morada": address.strip().replace('\n', ' '), "Website_Externo": website, "URL_Bizin": company_url } if website: data.update(scrape_external_site(driver, website)) # Voltar para o Bizin para continuar driver.get(company_url) else: data.update({"Site_Status": "N/A", "Emails_Site": "", "Contactos_Site": ""}) return data except Exception as e: print(f"❌ Erro em {company_url}: {e}") return None def main(): OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True) driver = get_driver() print(f"🚀 Iniciando scraping de {BASE_URL}") try: driver.get(BASE_URL) time.sleep(10) # Tempo para passar Cloudflare inicial # Obter links de empresas links_elements = driver.find_elements(By.TAG_NAME, "a") company_urls = [] for elem in links_elements: href = elem.get_attribute("href") if href and '/por/' in href and len(href.split('-')) > 4: if href not in company_urls: company_urls.append(href) # Filtrar a própria base URL company_urls = [u for u in company_urls if u != BASE_URL] company_urls = company_urls[:15] # Limite para teste inicial print(f"✅ Encontradas {len(company_urls)} empresas para teste.") except Exception as e: print(f"❌ Erro inicial: {e}") driver.quit() return results_count = 0 for url in company_urls: details = extract_company_details(driver, url) if details: results_count += 1 write_mode = 'w' if results_count == 1 else 'a' with open(OUTPUT_CSV, write_mode, newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=details.keys()) if write_mode == 'w': writer.writeheader() writer.writerow(details) time.sleep(3) print(f"🏁 Concluído! {results_count} empresas em {OUTPUT_CSV}") driver.quit() if __name__ == "__main__": main()