99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
import json
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import logging
|
|
|
|
# Logging setup: records are written to LOG_FILE at INFO level and above
# (the available levels are DEBUG, INFO, WARNING, ERROR, CRITICAL), each
# line formatted as "<timestamp> - <level name> - <message>".
LOG_FILE = "app.log"
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename=LOG_FILE,
)
|
|
|
|
def download_html_as_human(url, timeout=10):
    """
    Download the HTML of a web page while presenting a desktop-browser
    User-Agent header.

    The request goes through a ``requests.Session`` that is closed before
    the function returns.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request could block indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself
        fails (connection error, timeout, invalid URL, ...).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            return None
    except requests.RequestException as exc:
        # Callers treat None as "could not download", so a network failure
        # is reported the same way as a non-200 status instead of crashing.
        logging.warning("Request to %s failed: %s", url, exc)
        return None
|
|
|
|
def extract_author_from_json(json_data):
    """
    Find an author name inside decoded JSON-LD data.

    Args:
        json_data: The value produced by ``json.loads`` for a JSON-LD
            script: a dict (one node), a list of nodes, or anything else.

    Returns:
        The first non-empty author name found, or ``None`` when there is
        none. Lists are searched in order. A dict is checked for its own
        ``author`` entry first and then for nodes nested under ``@graph``.
    """
    if isinstance(json_data, list):
        for item in json_data:
            author = extract_author_from_json(item)
            if author:
                return author
        return None

    if not isinstance(json_data, dict):
        return None

    author = _author_name(json_data.get('author'))
    if author:
        return author

    # JSON-LD documents may wrap their nodes in an "@graph" container
    # instead of exposing them at the top level.
    return extract_author_from_json(json_data.get('@graph'))


def _author_name(author_data):
    """Return the name held by an ``author`` value, or ``None`` if it has none.

    ``author_data`` may be a plain string, an object with a ``name`` key,
    or a list mixing both; for a list the first non-empty name wins.
    """
    if isinstance(author_data, str):
        return author_data.strip() or None
    if isinstance(author_data, dict):
        return author_data.get('name') or None
    if isinstance(author_data, list):
        for entry in author_data:
            name = _author_name(entry)
            if name:
                return name
    return None
|
|
|
|
def get_author_from_json_ld(soup):
    """
    Extract the author from the page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` element is parsed in
    document order and handed to ``extract_author_from_json``.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).

    Returns:
        The first author found, or ``None`` when no script yields one.
        Scripts that are empty or contain invalid JSON are skipped.
    """
    for script in soup.find_all('script', type='application/ld+json'):
        raw_json = script.string
        if not raw_json:
            # .string is None for an empty tag or one with several child
            # nodes; json.loads(None) would raise an uncaught TypeError.
            continue

        try:
            json_data = json.loads(raw_json)
        except json.JSONDecodeError:
            continue

        author = extract_author_from_json(json_data)
        if author:
            return author
    return None
|
|
|
|
def get_author_from_meta(soup):
    """
    Read the author from the <meta> tag whose attribute is
    property="nrbi:authors".

    Returns the tag's ``content`` attribute, or ``None`` when the tag is
    missing or has no ``content`` attribute.
    """
    tag = soup.find('meta', property='nrbi:authors')
    if not tag or 'content' not in tag.attrs:
        return None
    return tag['content']
|
|
|
|
def get_author_from_url(url):
    """
    Download ``url`` and look up the page author in its metadata.

    The JSON-LD metadata is consulted first and the <meta> tag second; the
    outcome of the lookup is written to the log at INFO level.

    Returns:
        The author found, "No se pudo descargar la página." when the page
        could not be downloaded, or "Desconocido" when neither source
        provides an author.
    """
    page = download_html_as_human(url)
    if not page:
        logging.info("error, no se pudo descargar la pagina")
        return "No se pudo descargar la página."

    soup = BeautifulSoup(page, 'html.parser')

    # Try each metadata source in priority order; the first hit wins.
    for lookup in (get_author_from_json_ld, get_author_from_meta):
        found = lookup(soup)
        if found:
            logging.info(found)
            return found

    logging.info("No encontrado autor")
    return "Desconocido"
|
|
|