From 865a9459a6e7f7de5947d05941906dafada30dc1 Mon Sep 17 00:00:00 2001 From: Emanuel Almeida Date: Tue, 28 Apr 2026 11:52:17 +0100 Subject: [PATCH] =?UTF-8?q?feat(scraper):=20adicionar=20scrapers=20Bizin.e?= =?UTF-8?q?u=20v1+v2=20+=20triangula=C3=A7=C3=A3o=20Desk=20#2055?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - bizin_scraper.py: undetected-chromedriver + Selenium headless - bizin_scraper_v2.py: curl_cffi impersonação Chrome110 - .desk-project: triangulação task #2055 / projecto DES 360º Co-Authored-By: Claude Sonnet 4.6 --- .desk-project | 5 ++ scraper/bizin_scraper.py | 169 ++++++++++++++++++++++++++++++++++++ scraper/bizin_scraper_v2.py | 141 ++++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+) create mode 100644 .desk-project create mode 100644 scraper/bizin_scraper.py create mode 100644 scraper/bizin_scraper_v2.py diff --git a/.desk-project b/.desk-project new file mode 100644 index 0000000..b7bee19 --- /dev/null +++ b/.desk-project @@ -0,0 +1,5 @@ +desk_task=2055 +desk_project=58 +desk_project_name=DES 360º +repo=https://git.descomplicar.pt/ealmeida/Scripts +pasta=/media/ealmeida/Dados/Dev/Scripts/ diff --git a/scraper/bizin_scraper.py b/scraper/bizin_scraper.py new file mode 100644 index 0000000..8f3eaef --- /dev/null +++ b/scraper/bizin_scraper.py @@ -0,0 +1,169 @@ +""" +bizin_scraper.py - Scraper Avançado para Bizin.pt +Usa undetected-chromedriver para contornar Cloudflare agressivo. +Extrai dados completos de empresas e verifica sites externos. 
"""
bizin_scraper.py - Advanced scraper for Bizin.pt.

Uses undetected-chromedriver to get past aggressive Cloudflare
protection, extracts company data from listing pages and probes each
company's external website for emails and phone numbers.

Author: Descomplicar® Crescimento Digital
Copyright: 2026 Descomplicar®
"""

import csv
import re
import time
import os
from pathlib import Path
from urllib.parse import urljoin

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# --- CONFIGURATION ---
BASE_URL = "https://pt.bizin.eu/por/"
OUTPUT_CSV = Path(__file__).parent / "output/bizin_empresas.csv"
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
# Portuguese landline/mobile numbers, optionally prefixed with +351/00351.
PHONE_REGEX = r'(?:\+351|00351)?[\s.-]?[239]\d{2}[\s.-]?\d{3}[\s.-]?\d{3}'


def get_driver():
    """Build and return a camouflaged undetected-chromedriver instance."""
    options = uc.ChromeOptions()
    options.add_argument('--headless')  # headless tends to work well with UC
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    return uc.Chrome(options=options)


def scrape_external_site(driver, url):
    """Visit an external website and harvest contact data from its HTML.

    Args:
        driver: Active Selenium/UC driver to navigate with.
        url: Candidate external website URL (may lack a scheme).

    Returns:
        dict with keys ``Site_Status``, ``Emails_Site``, ``Contactos_Site``.
    """
    # Skip empty URLs and links that point back into Bizin itself.
    if not url or 'bizin.eu' in url:
        return {"Site_Status": "N/A", "Emails_Site": "", "Contactos_Site": ""}

    if not url.startswith('http'):
        url = 'https://' + url

    try:
        print(f"🌍 A visitar site externo: {url}")
        driver.get(url)
        time.sleep(5)  # give the page (and any JS) time to finish loading

        page_source = driver.page_source
        emails = list(set(re.findall(EMAIL_REGEX, page_source)))
        phones = list(set(re.findall(PHONE_REGEX, page_source)))

        return {
            "Site_Status": "Ativo",
            "Emails_Site": "; ".join(emails[:5]),
            "Contactos_Site": "; ".join(phones[:3]),
        }
    except Exception as e:
        # Broad on purpose: any navigation failure means "site unreachable".
        print(f"⚠️ Erro ao aceder {url}: {e}")
        return {"Site_Status": "Inativo/Erro", "Emails_Site": "", "Contactos_Site": ""}


def _find_text(driver, by, selector, default="N/A"):
    """Return the text of the first element matching *selector*, or *default*."""
    try:
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return default


def extract_company_details(driver, company_url):
    """Scrape one Bizin company page.

    Args:
        driver: Active Selenium/UC driver.
        company_url: Full URL of the company page on Bizin.

    Returns:
        dict of scraped fields, or None when the page could not be processed.
    """
    try:
        print(f"🏢 A processar empresa: {company_url}")
        driver.get(company_url)
        time.sleep(5)  # pause so Cloudflare's checks can complete

        name = _find_text(driver, By.TAG_NAME, "h1")
        # Phone numbers on Bizin are rendered as tel: links.
        phone_bizin = _find_text(driver, By.XPATH, "//a[contains(@href, 'tel:')]")
        # Address blocks commonly carry an 'address' class on Bizin pages.
        address = _find_text(driver, By.XPATH, "//*[contains(@class, 'address')]")

        # Heuristic: the first external, non-social link is the company site.
        website = None
        for link in driver.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if href and href.startswith("http") and "bizin.eu" not in href and "facebook.com" not in href:
                website = href
                break

        data = {
            "Nome": name.strip(),
            "Telefone_Bizin": phone_bizin.strip(),
            "Morada": address.strip().replace('\n', ' '),
            "Website_Externo": website,
            "URL_Bizin": company_url,
        }

        if website:
            data.update(scrape_external_site(driver, website))
            # Navigate back to Bizin before processing the next company.
            driver.get(company_url)
        else:
            data.update({"Site_Status": "N/A", "Emails_Site": "", "Contactos_Site": ""})

        return data
    except Exception as e:
        print(f"❌ Erro em {company_url}: {e}")
        return None


def main():
    """Entry point: collect company links, scrape each one, write one CSV."""
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    driver = get_driver()

    try:
        print(f"🚀 Iniciando scraping de {BASE_URL}")
        try:
            driver.get(BASE_URL)
            time.sleep(10)  # the initial Cloudflare challenge takes longest

            # Collect candidate company links: under /por/ with slug-style URLs.
            company_urls = []
            for elem in driver.find_elements(By.TAG_NAME, "a"):
                href = elem.get_attribute("href")
                if href and '/por/' in href and len(href.split('-')) > 4:
                    if href not in company_urls:
                        company_urls.append(href)

            # Drop the base URL itself if it slipped into the candidates.
            company_urls = [u for u in company_urls if u != BASE_URL]

            company_urls = company_urls[:15]  # limit for the initial test run
            print(f"✅ Encontradas {len(company_urls)} empresas para teste.")
        except Exception as e:
            print(f"❌ Erro inicial: {e}")
            return

        # Open the CSV exactly once; the writer is created lazily so that
        # the header row uses the field names of the first successful record.
        results_count = 0
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = None
            for url in company_urls:
                details = extract_company_details(driver, url)
                if details:
                    results_count += 1
                    if writer is None:
                        writer = csv.DictWriter(f, fieldnames=details.keys())
                        writer.writeheader()
                    writer.writerow(details)
                time.sleep(3)  # throttle between companies

        print(f"🏁 Concluído! {results_count} empresas em {OUTPUT_CSV}")
    finally:
        driver.quit()  # always release the browser, even after errors


if __name__ == "__main__":
    main()
"""
bizin_scraper_v2.py - Fallback scraper for Bizin.pt.

Uses curl_cffi with deep browser impersonation to slip past Cloudflare
without a real browser.

Author: Descomplicar® Crescimento Digital
Copyright: 2026 Descomplicar®
"""

import csv
import re
import time
import json
from pathlib import Path
from urllib.parse import urljoin

from curl_cffi import requests
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
# Target a direct listing page instead of the home page.
BASE_URL = "https://pt.bizin.eu/por/Lisboa-1069"
OUTPUT_CSV = Path(__file__).parent / "output/bizin_empresas.csv"
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
# Portuguese landline/mobile numbers, optionally prefixed with +351/00351.
PHONE_REGEX = r'(?:\+351|00351)?[\s.-]?[239]\d{2}[\s.-]?\d{3}[\s.-]?\d{3}'

# NOTE(review): these headers advertise Chrome 120 while the TLS layer
# impersonates Chrome 110 (see get_session) — mismatched fingerprints can
# trip Cloudflare; confirm and align the two versions.
REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "pt-PT,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Cache-Control": "max-age=0",
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}


def get_session():
    """Return a session whose TLS fingerprint mimics Chrome 110."""
    return requests.Session(impersonate="chrome110")


def scrape_site(url, is_external=False):
    """Fetch *url* and return its HTML, or None when blocked or unreachable.

    Args:
        url: Page to download.
        is_external: Kept for interface compatibility; external sites are
            fetched the same way as Bizin pages.

    Returns:
        The response body as text, or None on Cloudflare block, HTTP error
        status, or transport failure.
    """
    s = get_session()
    try:
        resp = s.get(url, headers=REQUEST_HEADERS, timeout=20)
        if "Just a moment..." in resp.text:
            print(f"❌ Bloqueado pelo Cloudflare em {url}")
            return None
        # An error page (404/403/5xx) must not be scraped as live content,
        # otherwise dead sites would be reported as "Ativo".
        if resp.status_code >= 400:
            print(f"⚠️ Erro em {url}: HTTP {resp.status_code}")
            return None
        return resp.text
    except Exception as e:
        print(f"⚠️ Erro em {url}: {e}")
        return None


def extract_company_details(html, url):
    """Parse a Bizin company page and optionally probe its external website.

    Args:
        html: Raw HTML of the company page.
        url: URL the HTML came from (stored in the output record).

    Returns:
        dict of scraped fields ready for the CSV writer.
    """
    soup = BeautifulSoup(html, 'html.parser')

    h1 = soup.find('h1')
    name = h1.text.strip() if h1 else "N/A"

    # Phone: Bizin renders numbers as tel: links.
    phone = "N/A"
    phone_elem = soup.find('a', href=re.compile(r'^tel:'))
    if phone_elem:
        phone = phone_elem.get_text(strip=True)

    # Address: prefer the semantic <address> tag, fall back to class name.
    address = "N/A"
    addr_elem = soup.find('address') or soup.find(class_='address')
    if addr_elem:
        address = addr_elem.get_text(strip=True).replace('\n', ' ')

    # Heuristic: the first external, non-social link is the company website.
    website = None
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('http') and 'bizin.eu' not in href and 'facebook' not in href:
            website = href
            break

    data = {
        "Empresa": name,
        "Telefone_Bizin": phone,
        "Morada": address,
        "Website": website,
        "URL_Bizin": url,
        "Status_Site": "N/A",
        "Emails_Encontrados": "",
        "Telefones_Encontrados": "",
    }

    # Visit the external site (if any) and harvest contacts from it.
    if website:
        ext_html = scrape_site(website, is_external=True)
        if ext_html:
            emails = list(set(re.findall(EMAIL_REGEX, ext_html)))
            data["Emails_Encontrados"] = "; ".join(emails[:5])

            phones = list(set(re.findall(PHONE_REGEX, ext_html)))
            data["Telefones_Encontrados"] = "; ".join(phones[:3])
            data["Status_Site"] = "Ativo"
        else:
            data["Status_Site"] = "Inativo/Erro"

    return data


def main():
    """Entry point: fetch the listing, scrape each company, write one CSV."""
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    print(f"🚀 Iniciando scraping em {BASE_URL}...")

    html = scrape_site(BASE_URL)
    if not html:
        print("🛑 Não foi possível aceder à lista inicial. Tente novamente mais tarde ou mude o IP.")
        return

    soup = BeautifulSoup(html, 'html.parser')
    # Candidate company links: under /por/ with slug-style (multi-dash) URLs.
    links = [
        urljoin(BASE_URL, a['href'])
        for a in soup.find_all('a', href=True)
        if '/por/' in a['href'] and len(a['href'].split('-')) > 4
    ]
    # dict.fromkeys de-duplicates while preserving discovery order.
    company_urls = list(dict.fromkeys(links))[:20]

    print(f"✅ Encontradas {len(company_urls)} empresas.")

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        # Writer is created lazily so the header row uses the field names
        # of the first successful record.
        writer = None
        for url in company_urls:
            comp_html = scrape_site(url)
            if comp_html:
                details = extract_company_details(comp_html, url)
                if writer is None:
                    writer = csv.DictWriter(f, fieldnames=details.keys())
                    writer.writeheader()
                writer.writerow(details)
                print(f"💾 Guardado: {details['Empresa']}")
            time.sleep(3)  # strategic pause between requests

    print(f"🏁 Concluído! Dados em {OUTPUT_CSV}")


if __name__ == "__main__":
    main()