""" bizin_scraper_v2.py - Scraper de Emergência para Bizin.pt Usa curl_cffi com técnica de impersonation profunda. Author: Descomplicar® Crescimento Digital Copyright: 2026 Descomplicar® """ import csv import re import time import json from pathlib import Path from urllib.parse import urljoin from curl_cffi import requests from bs4 import BeautifulSoup # --- CONFIGURAÇÕES --- # Vamos tentar uma página de listagem direta em vez da home BASE_URL = "https://pt.bizin.eu/por/Lisboa-1069" OUTPUT_CSV = Path(__file__).parent / "output/bizin_empresas.csv" EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' def get_session(): return requests.Session(impersonate="chrome110") def scrape_site(url, is_external=False): """Faz o request e tenta contornar o bloqueio.""" s = get_session() headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", "Accept-Language": "pt-PT,pt;q=0.9,en-US;q=0.8,en;q=0.7", "Cache-Control": "max-age=0", "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": '"Windows"', "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } try: resp = s.get(url, headers=headers, timeout=20) if "Just a moment..." in resp.text: print(f"❌ Bloqueado pelo Cloudflare em {url}") return None return resp.text except Exception as e: print(f"⚠️ Erro em {url}: {e}") return None def extract_company_details(html, url): """Extrai campos da página da empresa.""" soup = BeautifulSoup(html, 'html.parser') # Extração de campos baseada no teu PDF/Screenshot name = soup.find('h1').text.strip() if soup.find('h1') else "N/A" # Procurar por etiquetas de dados (Telefone, Morada, etc) details = {} # Tentar encontrar o bloco de contactos contact_block = soup.find(id='contacts') or soup.find(class_='contacts') # Telefone (procurar link tel:) phone = "N/A" phone_elem = soup.find('a', href=re.compile(r'^tel:')) if phone_elem: phone = phone_elem.get_text(strip=True) # Morada address = "N/A" addr_elem = soup.find('address') or soup.find(class_='address') if addr_elem: address = addr_elem.get_text(strip=True).replace('\n', ' ') # Website website = None for link in soup.find_all('a', href=True): href = link['href'] if href.startswith('http') and 'bizin.eu' not in href and 'facebook' not in href: website = href break data = { "Empresa": name, "Telefone_Bizin": phone, "Morada": address, "Website": website, "URL_Bizin": url, "Status_Site": "N/A", "Emails_Encontrados": "", "Telefones_Encontrados": "" } # Visitar site externo se existir if website: ext_html = scrape_site(website, is_external=True) if ext_html: emails = list(set(re.findall(EMAIL_REGEX, ext_html))) data["Emails_Encontrados"] = "; ".join(emails[:5]) phone_regex = r'(?:\+351|00351)?[\s.-]?[239]\d{2}[\s.-]?\d{3}[\s.-]?\d{3}' phones = list(set(re.findall(phone_regex, ext_html))) data["Telefones_Encontrados"] = "; ".join(phones[:3]) data["Status_Site"] = "Ativo" else: data["Status_Site"] = "Inativo/Erro" return data def main(): OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True) print(f"🚀 Iniciando scraping em {BASE_URL}...") html = scrape_site(BASE_URL) if not html: print("🛑 Não foi possível aceder à lista inicial. Tente novamente mais tarde ou mude o IP.") return soup = BeautifulSoup(html, 'html.parser') links = [urljoin(BASE_URL, a['href']) for a in soup.find_all('a', href=True) if '/por/' in a['href'] and len(a['href'].split('-')) > 4] company_urls = list(dict.fromkeys(links))[:20] print(f"✅ Encontradas {len(company_urls)} empresas.") with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f: writer = None for url in company_urls: comp_html = scrape_site(url) if comp_html: details = extract_company_details(comp_html, url) if not writer: writer = csv.DictWriter(f, fieldnames=details.keys()) writer.writeheader() writer.writerow(details) print(f"💾 Guardado: {details['Empresa']}") time.sleep(3) # Pausa estratégica print(f"🏁 Concluído! Dados em {OUTPUT_CSV}") if __name__ == "__main__": main()