# inversionitasbot/scrapper/autorsearcher.py

import json
import requests
import sys
from bs4 import BeautifulSoup

def download_html_as_human(url, timeout=10):
    """
    Download the HTML of a web page while presenting a browser-like User-Agent.

    The request is issued through a short-lived ``requests.Session`` so that
    cookies set during redirects are kept for the duration of the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Accepts
            whatever ``requests`` accepts (a number or a
            ``(connect, read)`` tuple).

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, malformed URL, ...).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    try:
        # The context manager releases the session's pooled connections even
        # if the request raises.
        with requests.Session() as session:
            # Without a timeout, requests would wait forever on a stalled server.
            response = session.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            return None
    except requests.RequestException:
        # Callers treat None as "download failed", so transport errors are
        # reported the same way as non-200 responses.
        return None

def extract_author_from_json(json_data):
    """
    Extract the author's name from decoded JSON-LD data.

    Supported shapes:
      * a list of JSON-LD objects (searched in order, recursively);
      * an object whose ``author`` is a dict with a ``name`` key;
      * an object whose ``author`` is a plain, non-blank string;
      * an object whose ``author`` is a list mixing the two forms above
        (the first usable entry wins);
      * an object that wraps its entities in a ``@graph`` container.

    Args:
        json_data: The value produced by ``json.loads`` for a JSON-LD script.

    Returns:
        The author's name, or ``None`` when no author can be found.
    """
    if isinstance(json_data, list):
        for item in json_data:
            author = extract_author_from_json(item)
            if author:
                return author
        return None

    if not isinstance(json_data, dict):
        return None

    if 'author' in json_data:
        author_data = json_data['author']
        # Normalise the single-author case so both shapes share one loop.
        candidates = author_data if isinstance(author_data, list) else [author_data]
        for candidate in candidates:
            if isinstance(candidate, dict) and 'name' in candidate:
                return candidate['name']
            # JSON-LD also allows the author to be given directly as text.
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()

    # Entities may be nested under a top-level "@graph" instead of being
    # placed at the root of the document.
    if '@graph' in json_data:
        return extract_author_from_json(json_data['@graph'])

    return None

def get_author_from_json_ld(soup):
    """
    Look for the author in the page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` element is decoded and
    handed to ``extract_author_from_json``; the first non-empty result wins.

    Args:
        soup: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object in this module).

    Returns:
        The author's name, or ``None`` when no script yields one.
    """
    for script in soup.find_all('script', type='application/ld+json'):
        raw_json = script.string
        if not raw_json:
            # .string is None for an empty tag or one with several children;
            # json.loads(None) would raise TypeError, so skip such scripts.
            continue
        try:
            json_data = json.loads(raw_json)
        except json.JSONDecodeError:
            # Malformed JSON-LD is common in the wild; try the next script.
            continue
        author = extract_author_from_json(json_data)
        if author:
            return author
    return None

def get_author_from_meta(soup):
    """
    Read the author from the <meta property="nrbi:authors"> tag.

    Returns the tag's ``content`` attribute, or ``None`` when the tag is
    missing or carries no ``content`` attribute.
    """
    tag = soup.find('meta', property='nrbi:authors')
    if not tag:
        return None
    if 'content' not in tag.attrs:
        return None
    return tag['content']

def get_author_from_url(url):
    """
    Download a page and report its author.

    The JSON-LD metadata is consulted first and the
    <meta property="nrbi:authors"> tag second. When the page cannot be
    downloaded, or neither source names an author, a descriptive message
    string is returned instead of a name.
    """
    page = download_html_as_human(url)
    if not page:
        print("error")
        return "No se pudo descargar la página."

    parsed = BeautifulSoup(page, 'html.parser')

    # Extractors are tried in priority order; the first non-empty answer wins.
    for extractor in (get_author_from_json_ld, get_author_from_meta):
        found = extractor(parsed)
        if found:
            return found

    return "Autor no encontrado en los metadatos."

if __name__ == "__main__":
    # Command-line entry point: the article URL is the first argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Uso: python autorsearcher.py <URL>")
    else:
        print(get_author_from_url(cli_args[0]))