encontrado autor y link

This commit is contained in:
2025-02-02 20:52:32 +01:00
parent 4bf3f0c331
commit c6d6724466
7 changed files with 1756 additions and 67 deletions

View File

@ -1,75 +1,93 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import requests
def get_html_without_javascript(url):
import sys
from bs4 import BeautifulSoup
def download_html_as_human(url):
"""
Utiliza Selenium para obtener el HTML de una página web con JavaScript desactivado.
Descarga el HTML de una página web simulando un navegador real y usando cookies de sesión.
"""
# Configuración de opciones de Chrome
chrome_options = Options()
chrome_options.add_argument("--disable-dev-shm-usage") # Usa memoria compartida si no hay suficiente RAM
chrome_options.add_argument("--no-sandbox") # Desactiva el sandboxing (problema común en entornos sin root)
chrome_options.add_argument("--disable-extensions") # Desactiva extensiones del navegador
chrome_options.add_argument("--remote-debugging-port=9222") # Debugging remoto
chrome_options.add_argument("--headless=new") # Nueva implementación de headless
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
session = requests.Session()
# Inicializar el driver de Chrome
driver = webdriver.Chrome(options=chrome_options)
response = session.get(url, headers=headers)
try:
# Accede al sitio web
driver.get(url)
# Obtener el contenido de la página
html_content = driver.page_source
return html_content
finally:
# Cierra el navegador
driver.quit()
if response.status_code == 200:
return response.text
else:
return None
def get_author_from_html(html_content):
def extract_author_from_json(json_data):
"""
Utiliza BeautifulSoup para extraer el autor de una página HTML.
Extrae el autor del JSON-LD, incluso si está en una lista.
"""
try:
# Analizar el HTML con BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
if isinstance(json_data, list):
for item in json_data:
author = extract_author_from_json(item)
if author:
return author
elif isinstance(json_data, dict):
if 'author' in json_data:
author_data = json_data['author']
if isinstance(author_data, list):
for author in author_data:
if isinstance(author, dict) and 'name' in author:
return author['name']
elif isinstance(author_data, dict) and 'name' in author_data:
return author_data['name']
return None
# Buscar la meta tag con name="autor"
author_meta = soup.find('meta', attrs={'name': 'autor'})
# Si existe, devolver el contenido del atributo "content"
if author_meta and 'content' in author_meta.attrs:
return author_meta['content']
else:
return "No se encontró el autor en la meta etiqueta."
except Exception as e:
return f"Error al analizar el HTML: {e}"
# Función principal
def main(url):
def get_author_from_json_ld(soup):
"""
Combina ambas funciones para obtener el autor desde una página con JS desactivado.
Extrae el autor de los metadatos JSON-LD, considerando estructuras con listas y objetos.
"""
try:
# Obtener el HTML sin JavaScript usando Selenium
print("Obteniendo HTML con JavaScript desactivado...")
html_content = get_html_without_javascript(url)
scripts = soup.find_all('script', type='application/ld+json')
for script in scripts:
try:
json_data = json.loads(script.string)
author = extract_author_from_json(json_data)
if author:
return author
except json.JSONDecodeError:
continue
return None
# Guardar el HTML en un archivo local (opcional)
with open("pagina_web_sin_js.html", "w", encoding="utf-8") as file:
file.write(html_content)
def get_author_from_meta(soup):
    """
    Return the author stored in the <meta property="nrbi:authors"> tag.

    Looks up the first matching tag in *soup* and returns the value of its
    ``content`` attribute. Returns None when the tag is not found or has no
    ``content`` attribute.
    """
    tag = soup.find('meta', property='nrbi:authors')
    # Guard clause: nothing to report without the tag and its content.
    if not tag or 'content' not in tag.attrs:
        return None
    return tag['content']
# Obtener el autor desde el HTML
print("Analizando HTML para obtener el autor...")
author = get_author_from_html(html_content)
def get_author_from_url(url):
"""
Busca el autor en los metadatos JSON-LD y en la etiqueta <meta> de una URL.
"""
html_content = download_html_as_human(url)
if not html_content:
print("error")
return "No se pudo descargar la página."
soup = BeautifulSoup(html_content, 'html.parser')
author = get_author_from_json_ld(soup)
if author:
return author
except Exception as e:
return f"Error general: {e}"
# Ejemplo de uso
url = "https://www.abc.es/internacional/trump-presidente-vez-era-dorada-empieza-momento-20250120202851-nt.html"
author = main(url)
print(f"El autor es: {author}")
author = get_author_from_meta(soup)
if author:
return author
return "Autor no encontrado en los metadatos."
if __name__ == "__main__":
    # Command-line entry point: the article URL is the first argument.
    if len(sys.argv) <= 1:
        print("Uso: python autorsearcher.py <URL>")
    else:
        print(get_author_from_url(sys.argv[1]))

338
cookies.txt Normal file
View File

@ -0,0 +1,338 @@
[
{
"name": "__Secure-1PAPISID",
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 1
},
{
"name": "__Secure-1PSID",
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFTND9FvfSmXKvG7EhWgJ4hgACgYKAZgSARYSFQHGX2MiLo_4je-xWwmTLnlXP5Va_RoVAUF8yKrKCtOBz3FTPT0d-p0dW4vu0076",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 2
},
{
"name": "__Secure-1PSIDCC",
"value": "AKEyXzUd4l-NSvsadu5JrPxgjJm6PxbCCl2-3OgSwJ5HG3aDrfBJW22PV74GirgyFJgYVSu7mw",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769965125,
"storeId": "firefox-default",
"id": 3
},
{
"name": "__Secure-1PSIDTS",
"value": "sidts-CjEBmiPuTRfT43VJY05lEAkfcUHZ3VCWX9aRHEdwZxtXc4LtOH2h0Aq7oxhCy8rqyo10EAA",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769965123,
"storeId": "firefox-default",
"id": 4
},
{
"name": "__Secure-3PAPISID",
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 5
},
{
"name": "__Secure-3PSID",
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFCkMoPKmrDiRPcWjHSIbRNAACgYKAUcSARYSFQHGX2MivW7fEv6cBOgJFPhPcV6qKxoVAUF8yKpKHw_E6y4Avtii2PoRIyw20076",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 6
},
{
"name": "__Secure-3PSIDCC",
"value": "AKEyXzUgCPoQ0O0uww4uLiIAjVvZ8_SNjJEe-NuYvtLsnb9ETZaJyZtTroaByYu-BtNKc62jEfg",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769965125,
"storeId": "firefox-default",
"id": 7
},
{
"name": "__Secure-3PSIDTS",
"value": "sidts-CjEBmiPuTRfT43VJY05lEAkfcUHZ3VCWX9aRHEdwZxtXc4LtOH2h0Aq7oxhCy8rqyo10EAA",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769965123,
"storeId": "firefox-default",
"id": 8
},
{
"name": "__Secure-ENID",
"value": "25.SE=D8_8M9M1VzIRiiZUPSaP4JRb9ppYIQ82KAkJKDUKm7ILjfL-sCTzSniTZJsk6wdgKHys832yLnDxHPdNrtukJqHsJxhc8QAEEHxuf4xz_T3N3YQkMd72sI6vFpU_wFCUyl0rk_OrHf5qB9aIPJEIC7LmjFYYLl4hM4GaC6in_lNLbW6xKBeU2YbkVW66RJDrdC05LltVQBy98rWCTbf1aBKhB75qYUftGtlj3BoGtDsCKdVLXYi-jaQ9j5MetSVC1a0yr58duP9CbRVl8euY5Vga4FiUd6HiqsunYCzn_KgpxK1WuzapPvQ3joXE_4vIx6ebOs2X3I8V4CJ8wFzeygy-MJnCxBYb1rGyuxO4XtQwsEMa8DrJfbx9K69KS6ctzEck51EneWMmspaIbiH5W3S3IX9RUkghqybZ_6y7vWJQNoiQjwx4SeJjOnhy5bIyvXGDykSd8f4McpjGA4qwuFYwGhIF1pyD1A",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "lax",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769946337,
"storeId": "firefox-default",
"id": 9
},
{
"name": "AEC",
"value": "AVcja2el4JQqNhibDkfGyyOdb6_-7RkRMRgq4UAUbFz3ML6a0oPGNEG4Hjc",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "lax",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1747499754,
"storeId": "firefox-default",
"id": 10
},
{
"name": "APISID",
"value": "ISowM3NHT-yoAbMx/AIpIkP3Ni3zbhd6-S",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 11
},
{
"name": "HSID",
"value": "Ao9sW4GVwyC96YdSc",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 12
},
{
"name": "NID",
"value": "521=oXspz8mdQQUdk-0i_wzC3i-ZcHMpMECXF-T9zqxjTtRr0SOWciPctQomIqc1tX4OVCd1rMbFZtwuhDuLUzBchZhij6ho5N4iq79YYH39NvhW4014kyGmX2so1ewvALHC7lyWD-Qtb-Wws910_w5llt0hu4_3uoeifnRnmy01gfV_5hCBYyR7tCCtN4cVW3eXE7NeWkMAsfqcw37IdLJjHJWb5IwRrw7dhBKvo0OhgHYaSgkSJFLmvXKaWnze0_6t2-BbwslSmcZFNPA5CJgJGSkc20n1n-zD-hITrO4xCCYcJUE6nD-mjogVhgY3hagAoYRG6E15qOjWLMDoZORqgy9qi6ADsPe0Ebz3w_jWzHvMR9gFNSxkkYzgvLY75KGRX7W0pYjc5SNXecottZRcBYtEvHvtYUA4sfPUW24vDpHbDJFcf7SqScx3cII-i8two4RWqpzyXiN1hkW5GrdFzZ9_rCPa1QwTOYSrQ1RyZO8A7PZD8-e9b7UBZDrjqYhOfUZlx2qfb0lbUy2rl6SB-f376nEZPMHg_P7PYN2jKmgbcsooRenosPt4de8nqj9lgoF4CKYgBwGV3GGbK6_qA1jsfOzSoWyflO1lLmZSVRKeS9hdDjlVyvDizsuvio7lKvb4ukKDh9BXz2T8YHBK4vq96jBc0MU13FjxFRxed9eCRPZLKmMVBvZeG86yg-_wv2f4jxn0FL4SXgMJhTrdZ5YhEeV8jeDI_Yiw63w3YgRBy-D96jxpGPDFG0RKzxIYSfQmtfsTXiHbK5kzcqwnGkG9gxSuDOzU66etxEKrheQf9es0YY1nYgJMNMDcyLHI0rEB6PQIyTPRQ0NnLibJ_jOzfcTtwV1uLzF0ybc-ay6dtTFVk3Whv0xnds6KhHg7IPfB9jk_FDDEhwIsTTdGWhlafiyoIcpb0ySVyJa9uTmFjCXJ7ohEY9dzbh36gUrm1Apx9FuY48BZ_2Qos4SbhHNyIIuJL5PpZL5n-Boxpkm3Y-dA7CYxvmYbvclWMOiwyfRBRk1tksDJE0VNjWd11th-aepnEqBpKdd1kiEjeAzEFbJR1zyRI5obgDDSpCgVXILqmwE_ikdGT8N1bjVOcloHyFj8n2pcr5k-fm9yqHAji3XciLZOdmK4PIqxgVAau-rxviVdYgZ-ejc_f9ht5NdybzERGf1bccvz7Vrx4kaF_8KAoIsSEUckU5wr6rWc25xkAiToKFMug-s0YE6SBJx-GGvyRq8mYWMQBmKXN7gf_X967Uh1okMFXiCjT_OeoPtyAuG1amqEOpBiXMOy5atCvYKxgGpLP68GhVahiiTwfmMYuO93u5jcuiFkEDqKyfTEp8Nzt6_AWOYCjKxkxoln_kalNpA2itptpdTgbdWm-yBAQ-UX6IgqmPXBaq_LIJ9H0MH96GmW1GbXo1xCZtpQcX_4ZHblJQN0qIfcrsAA9YxsS2bPkn8kBEjG05nya5vKcHqabVAJycUdjYJ-Jcv4lffHJehfZ8rO6UDaZMmHwIFtO8MXxDMg8__rsnwoCJf2UjERAgunzm_6fjgk7bsSmx083Wvuz2ThyECubkHAamy4nmbMnHQu2NOmI8tS1-myT7Ax9eh2ktXbgI33mJS28vw2haaBclB56ViG5vjnWZI23w-t9J7MaepWv8MvydCa-FwAUq6LjwTte0AmUpv0QCkBJCpONt2J6wjKoJAzjJX9o9bLMvK_MPoZqwbL2YBlfcqaUBSLxqojgmpM9KcgGHDE0Z2gtKUGLURVLcdNb2kypy4lQFNnYJiBPkQaKbSx-FnTnZSeCwjCXbjLzwMIueYNRQG1DOPiQAEq8ZfOg3FCGXpN0yn-5iqKhxA0VQwP1bkNPSkJUj1z3_E-vc-4fcFHh1-IQmLost-FnQLa394RFbTgsTLI-0-AbsoKgrWPn02y6SAEyqkfPVtOZRc0X4YZvMzDpX-l4exKRHcm3s5N2TL_34VNJYCYrtENnT
_ksG32RouayI2ox8L1YBGHcShh4ezSa3hBmV76U70ByU2jzIpfxgI73mql4lf_6U4NH8t9UCCjC2qVm_bPUHuNB6GpdlOr8x0zGqEI6MtHunobQr6zfWDwJfYaMWweCk5HH-wy9J0rEe7g",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1754236994,
"storeId": "firefox-default",
"id": 13
},
{
"name": "SAPISID",
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 14
},
{
"name": "SID",
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFcft1TRkY75bBTa0XBuSVAgACgYKAdYSARYSFQHGX2MiMAjs2bPQdC2UfdL8v0IzKRoVAUF8yKoLpJT0D-AbEsIwcO_MyCWn0076",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 15
},
{
"name": "SIDCC",
"value": "AKEyXzUYOUC0XkqwSRKRewqlXoMuGP9Feh_bprmEU-G4tRnFTVL-2EGZb9ROEaJWf9h2tMTHeks",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1769965125,
"storeId": "firefox-default",
"id": 16
},
{
"name": "SSID",
"value": "Ay3uB5fTOa7rN6h8V",
"domain": ".google.com",
"hostOnly": false,
"path": "/",
"secure": true,
"httpOnly": true,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1800808147,
"storeId": "firefox-default",
"id": 17
},
{
"name": "_ga",
"value": "GA1.1.1954799460.1737385433",
"domain": ".news.google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1801501123,
"storeId": "firefox-default",
"id": 18
},
{
"name": "_ga_SYGF1G18MM",
"value": "GS1.1.1738427626.2.1.1738429123.0.0.0",
"domain": ".news.google.com",
"hostOnly": false,
"path": "/",
"secure": false,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1801501123,
"storeId": "firefox-default",
"id": 19
},
{
"name": "GN_PREF",
"value": "W251bGwsIkNBSVNEQWpYeTdtOEJoRDRxZWJNQWciXQ__",
"domain": "news.google.com",
"hostOnly": true,
"path": "/",
"secure": true,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1753153431,
"storeId": "firefox-default",
"id": 20
},
{
"name": "OTZ",
"value": "7918024_52_52_123900_48_436380",
"domain": "news.google.com",
"hostOnly": true,
"path": "/",
"secure": true,
"httpOnly": false,
"sameSite": "no_restriction",
"session": false,
"firstPartyDomain": "",
"partitionKey": null,
"expirationDate": 1739977432,
"storeId": "firefox-default",
"id": 21
}
]

40
iacorrector.py Normal file
View File

@ -0,0 +1,40 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the model and the tokenizer (adjust the path if the model is local).
modelo_nombre = "meta-llama/Llama-3-8B" # Or use a local model such as "ruta/al/modelo"
tokenizer = AutoTokenizer.from_pretrained(modelo_nombre)
modelo = AutoModelForCausalLM.from_pretrained(modelo_nombre, torch_dtype=torch.float16, device_map="auto")
# Logit threshold (tune based on experiments): the affirmative answer's logit
# must exceed this value for evaluar_seguridad_nacional to return True.
UMBRAL_LOGITS = -1.0
def evaluar_seguridad_nacional(texto):
    """
    Decide whether *texto* is related to national security.

    The module-level model is prompted to answer 'sí' or 'no', and the
    logits it assigns to each answer at the position right after the prompt
    are compared.

    Args:
        texto: Text to classify.

    Returns:
        True when the 'sí' logit is greater than both the 'no' logit and
        UMBRAL_LOGITS; False otherwise.
    """
    prompt = f"Evalúa si el siguiente texto está relacionado con defensa nacional, inteligencia, espionaje, fuerzas de seguridad, policía, ejército o fuerzas armadas. Responde solo con 'sí' o 'no'.\n\nTexto: {texto}\n\nRespuesta:"

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(modelo.device)

    # Single forward pass; no gradients are needed for scoring.
    with torch.no_grad():
        outputs = modelo(**inputs)

    # Scores over the vocabulary at the last position of the prompt.
    logits = outputs.logits[0, -1, :]

    def _logit_respuesta(palabra):
        # Score of the first token of `palabra`. The id comes from the
        # tokenizer's own encoding: the previous check tested an integer id
        # against the token-string keys of get_vocab(), which never matched,
        # so every score was -inf and the function always returned False.
        # NOTE(review): the leading-space variant (" sí" / " no") may be the
        # likelier continuation after "Respuesta:" - confirm for this model.
        ids = tokenizer.encode(palabra, add_special_tokens=False)
        return logits[ids[0]].item() if ids else -float("inf")

    logit_si = _logit_respuesta("sí")
    logit_no = _logit_respuesta("no")

    # Affirmative only when 'sí' beats 'no' and clears the threshold.
    return logit_si > logit_no and logit_si > UMBRAL_LOGITS
# Example usage: classify a sample sentence and print the result.
# NOTE: these statements are not under a __main__ guard, so they also run
# whenever this module is imported.
texto_ejemplo = "El ejército ha desplegado unidades en la frontera para proteger la soberanía nacional."
resultado = evaluar_seguridad_nacional(texto_ejemplo)
print(f"¿El texto está relacionado con seguridad nacional? {resultado}")

View File

@ -1 +1 @@
Real Madrid
Defensa

View File

@ -0,0 +1,82 @@
[
{
"title": "Giro de Sánchez en Defensa: envía una carta a Calviño junto a otros 18 líderes europeos para que el BEI eleve la inversión militar - El Mundo",
"content": "<a href=\"https://news.google.com/rss/articles/CBMiekFVX3lxTE9JcGJMVUUzZTY1OFpqRHNTd2xRUzUtTEt1bmotbGtFZjVkdnd3bzBXdU9ienhXWGtVaUhJY2dOajR1RUFIVEhWYTFubVdHNHExVXJ3QUpLcExva0lUbzBvbl9yTUtJWkZjVlluZzRYSUZlVEZaX012VmVR0gF6QVVfeXFMT3RiMTB3dVNseWRGMlVadGkweXowXzN5YTBIMFlqZjNfZXBfalN4REctZGtNZzYtTWp6WVo2VUtHT2NFRTZ2M1NoTWZWeEZkN2w3N3RQV2FuT2lrNHVpcDBlNlVSM2c5UGVxSEhvSXlmSFA0ZEZyYVViQ3c?oc=5\" target=\"_blank\">Giro de Sánchez en Defensa: envía una carta a Calviño junto a otros 18 líderes europeos para que el BEI eleve la inversión militar</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">El Mundo</font>",
"author": "Daniel Viaña",
"newspaper": "El Mundo",
"date": "Fri, 31 Jan 2025 10:46:17 GMT",
"link": "https://www.elmundo.es/espana/2025/01/31/679ca997e85ece2e4e8b45b0.html"
},
{
"title": "Las razones de Dallas para traspasar a Doncic: mejorar la defensa y su “condición física” - La Vanguardia",
"content": "<a href=\"https://news.google.com/rss/articles/CBMi1wFBVV95cUxOeVdhaWs4WW5Pak5ZUmF5ZDIySV9HVEJTT0lnVXg1Z0swZ1ROeGRQZzk5YktraTNFOXQ5WGFSSWx3cUI5Vnk2cUhxWW5NcG9UakJvMEhpQVVUeEtheEJsaEQzRXZESDRDc1lNc0l4WFkzSlRnenI1SlJHVUE2STdnQ0dZNG1vUExadXdJenB1MWdxM2NoNUdwQmRibGFJUnBVRElPVlJET3E4MzljQ29vSEF5N2hJcnBRZjV3Z3NMdmViLVlOMldYbFVuYkROR1lBVU8tbS1Ud9IB3AFBVV95cUxOdDE3OVlZeTF6QloyWEFQaWV0eVpDdzhORkJRY1I3UjBtYXktdE8wVmZHaVlqMlVNYU1lR1pMTnAxRUl4Z2pJRzd1bUU1REV2S0ZEeEJ3MmM4bUVDNHBnMlVHWHlhZjUwVGtSbDFfU1Y0aDAyN2xGVVpPYWxib2N3MVFpVVRZSTNFc3dMbnBwVzI1ZVFhRDdFS29YZ2NGejVQMDZDQXc0MVE1MDhrZjlwVW5uX2ZfT3JpNGE5S3JPTE5iZ2RLU1FUVDVzTVU1S0pRaG1zQjNHU2tTTjBI?oc=5\" target=\"_blank\">Las razones de Dallas para traspasar a Doncic: mejorar la defensa y su “condición física”</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">La Vanguardia</font>",
"author": "Pedro Ruiz",
"newspaper": "La Vanguardia",
"date": "Sun, 02 Feb 2025 11:04:48 GMT",
"link": "https://www.lavanguardia.com/deportes/baloncesto/nba/20250202/10344099/razones-dallas-traspasar-doncic-mejorar-defensa-condicion-fisica.html"
},
{
"title": "Miles de personas se manifiestan en Buenos Aires contra Milei en defensa del colectivo LGTBIQ - El Confidencial",
"content": "<a href=\"https://news.google.com/rss/articles/CBMi5AFBVV95cUxQX1V6RnR0MW5NaFdKMjg1aFpCd2VJNVZ3ZWg3eU1YTVZiWEJGdm1XSk1nRms4QldDU3VISktEd3FpNDVOYUZQMWNmWnk3dHlsc0ExSU94YmpqZ0dKZlRDNlcyX0hzcXotdS14a0YxaWUweWFQQzBsLU80RlRvZXhMbjlXbXVoS2JIZzg2b3ItT0pRT0FjdjRhbzR1MnhnbVBsNWlBa09QZUp5dkx6YXkxODJlTFl3VEs2azk2dnYwaFcxNGJHUFRkUG01U0VZb3U4THVINzVZZUJJZjRtSmRCVWRTcTnSAeoBQVVfeXFMT3gxQkpSNTg3U1lVUE1VYk4zRW9QTV9fR1BsOEZxNmFBZFhfMVVGQ1ZzbU9mRjRvOFhUVTE3MTBfcDJlUmJjNS1kSl8yN3ZWaEdTUXQ1cU95WjVrN0VndVA5bjVremNKOWFPdkNtT3k2Qlh2SXFETGxMYXBCRHQzZVBfMWJFS0Q3UmdnRk5EWXJhWE1TWDlzVk9NazNiQzFOcWRpYXBHbTlhWVNGRFA4ZHJNdjQxRUhIVUdLdEVYVkhYMHVOajFMcHFjVFZaZTZmNmR4eFNtZU1sMm9vTnlNSHBkR3d1c1ZjYmd3?oc=5\" target=\"_blank\">Miles de personas se manifiestan en Buenos Aires contra Milei en defensa del colectivo LGTBIQ</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">El Confidencial</font>",
"author": "Borja Fernández",
"newspaper": "El Confidencial",
"date": "Sun, 02 Feb 2025 11:50:00 GMT",
"link": "https://www.elconfidencial.com/mundo/2025-02-02/miles-de-personas-se-manifiestan-en-buenos-aires-contra-milei-en-defensa-del-colectivo-lgtbiq_4055892/"
},
{
"title": "Defensa revisa partidas para subir el gasto ante la OTAN por la vía rápida - ABC.es",
"content": "<a href=\"https://news.google.com/rss/articles/CBMimgFBVV95cUxNSTU0YWpLQ0czbS1oZUhJWC1zZXo4a2JsWHZYSEluS2h4OVV5bWt1SmFwRFRZZ3dCaUFoeVNKOWZ5cWQyMElmelg0LS1peWl3b04tU2tSWmU1Z3BLblVlcmlqMnd2VThjNnlwaFBXZFNJZWx2ZGlKWXNqb3hOa1hNbEhha2trZURac3NRMXE3WjFaWVctMVZaM1dR0gGfAUFVX3lxTFBfMDJ6RDFpTXJheklMNi1LZHdIMjlSTVIwMzVyZVR6UktTWHZ4WVF1M1ZnNXVXa2ItQzA2Q1haSEN3YUEzM0Nxa3ZvX1k5dWRaYmtsVWdLQUNBS1hlNklXN0x1ZDdqYVNwQy1mMm1peHJKbUQ1VlA5SmZDQjFKRklXekFGWldjWE1FZkxtR0drck9iRmcxd25jTWVTbkZncw?oc=5\" target=\"_blank\">Defensa revisa partidas para subir el gasto ante la OTAN por la vía rápida</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">ABC.es</font>",
"author": "Ana Sánchez",
"newspaper": "ABC.es",
"date": "Sat, 01 Feb 2025 03:38:34 GMT",
"link": "https://www.abc.es/espana/defensa-revisa-partidas-subir-gasto-ante-otan-20250201194511-nt.html"
},
{
"title": "La Defensa de la Trinchera: Inversión en ETFs - esRadio",
"content": "<a href=\"https://news.google.com/rss/articles/CBMiywFBVV95cUxPZkZYYkMwakh0RVhYVkg1S0EtMHRCN0M5a2FFWEtkVXczT3F0Nm9nT0lERk1NRDFxSkI3UUxrWU9nQTRPclV0R3hVdmZVdXFhVlk4a21LZjI0akVRZGh1cmVFelVBV0UxUldtenF1YWV0VkdrZkRyN01IUDdwS1gydTFlTHBTamlRTDFTVEtXNnlNb0p4dFRnQ3ZwYWxnbUJuVXlUWi16YjVBbHdoZE9ZcDV5ZnBBZTN3TzhrVVBSTmRQeDFELVJGQlhUQQ?oc=5\" target=\"_blank\">La Defensa de la Trinchera: Inversión en ETFs</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">esRadio</font>",
"author": "Manuel Llamas",
"newspaper": "esRadio",
"date": "Sun, 02 Feb 2025 10:15:00 GMT",
"link": "https://esradio.libertaddigital.com/fonoteca/2025-02-02/la-defensa-de-la-trinchera-inversion-en-etfs-p7106614-s7107392-7213634.html"
},
{
"title": "La Laguna Tenerife, a por la defensa del cuarto puesto - Diario de Avisos",
"content": "<a href=\"https://news.google.com/rss/articles/CBMioAFBVV95cUxQZDhXNC1fWWNVT3J0anktb0Q5ejNNcFM1OUNEMVNGc2FXb00yVk5SblBYRnJ0VE01Ymg3ckpYZHFhbUJpNkRfSWNVLW9NcU5GQXNKUUlQOVctdHBuTmduTFhYVS1UZURWS2hrRXl2U2UtSGtad0VYbGZxMHFCUG1lU1B6dVNsdGYwVnN0cjdBbjVsNnc3RENmMFNFblUxRGln0gHuAkFVX3lxTE41X3dNRkxoYlN5NVJuWHhtaXlXV1V1MFJwTHJVYmc1UlhXT3JpSUl6LUhFQ3BFSmRRMzY3emJHS19SSVl2OUxDVUhUUktpSU9jeEhyUElURzR3VHpubXNfUmp4M3hrdzRqV2pST3dKTlNrYUJNWlRYbjhscFo1MlMyZWYybS1QdHRTdW04QXo2SVlXbG5zeEpjdm44bXdRZHlSaXpCckZ1T2xZQzUwRG14Y3praEVBUlZERHYzOTd1WklxWGstV21tT05PMFFwMXVaczJPSGNmRUc4RkxydTZ4YXo3cnhyQ0dNc29EZU8zYXJLS3o0cExxWTVMa3I3V0NQVDVleEhSZFEzenVvUEp5Wm83aDlTQ3podWR0aFctTVdyNFQtNzQxRlMxaUlJUHFUdm13TV84Y1lOR1pjVldlcjRsUnJoQXN2THRVX3hIM0pOS1htalFudlpoVU1xNWs2TFBPYXZEbXNpUUV2UQ?oc=5\" target=\"_blank\">La Laguna Tenerife, a por la defensa del cuarto puesto</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">Diario de Avisos</font>",
"author": "Autor no encontrado en los metadatos.",
"newspaper": "Diario de Avisos",
"date": "Sun, 02 Feb 2025 00:15:24 GMT",
"link": "https://diariodeavisos.elespanol.com/2025/02/la-laguna-tenerife-a-por-la-defensa-del-cuarto-puesto/"
},
{
"title": "Defensa niega que exista un brote de sarna entre los soldados en Valencia y reduce los contagios a dos militares - Infobae España",
"content": "<a href=\"https://news.google.com/rss/articles/CBMi6wFBVV95cUxQeXpTSkl6R2VrSVhVSnZKdC1hcXdjdG4wb0xRelVvV0h3M2p6UXlPaDl6dkh6d1RJN3N6WlFsR1lwR19VYVBsakJBR2p6OV9nU1AzZ0YtMk1lc1JmdmpIUG10NUdWOHY1UTh0dVFFVzRiQmVfTEZaYU5qSHV0RkVrTTRybjNsX0tvVlNRbHF5b19Jdzdob0ViMjR1ME0zOUpvekxmWXpScS1GOGZ3bjVvWUIyUllmUWZNTUw2dXRmRUNMYjJ1VGFDc1hJZ2MwYTROMmJvVGZpdXdiWENsM01Ra3dpMDVUMFhfUjZB0gGGAkFVX3lxTE9Zd0RxZW1VVWYzOWJPUEpOdVFEbGVsQ3g2MlhuTjNuSDZVdDhYd2ZOV0V5S25FMnVqUjRxeTVYZnlLejc2blNWM2JiUFdJTWlOQW1LRENlNU95dUFBb0wwc3ZpMTRFbWxlT1VwMlQ1MXpkelpENC02ei02b2l1S3pIcjNrMHVmTjZJU0g5b3B1a0NCdTU2MGdEQWZUSERmSS1wOHN2ajFmVDhTVkpkbDNWbVhKT28ya09XclltYkFlUkxoYTR1MERnRnRPMHVsbWJiZ1lGcThVYXBBMmZ3cE5kNWVDNktRakJSWnlLZ3BmWlYtc19PaU9CTFk4ZHpaZTRDa3R4YlE?oc=5\" target=\"_blank\">Defensa niega que exista un brote de sarna entre los soldados en Valencia y reduce los contagios a dos militares</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">Infobae España</font>",
"author": "Gastón Trelles",
"newspaper": "Infobae España",
"date": "Sat, 01 Feb 2025 12:34:00 GMT",
"link": "https://www.infobae.com/espana/2025/02/01/defensa-niega-que-exista-un-brote-de-sarna-entre-los-soldados-en-valencia-y-reduce-los-contagios-a-dos-militares/"
},
{
"title": "Ucrania rompe la defensa aérea de Putin con un triple ataque - El HuffPost",
"content": "<a href=\"https://news.google.com/rss/articles/CBMikwFBVV95cUxPSlJ6TDF2N3R6WE5WeVZ3aHgxQU9ZM1U4TVVmMjdpTHRWVk5jTU5QSF9YcGNNdVFlaXVlRFAxUS1SeUtFb1hFS1B0ZFFEUXA4dkJaN0ZUWEFvd0FFME9LRzNCOWt0enNzc2lEZ1k3OGVlQmJsWTFZS0pyenpCcEZ1b3EwOHhKV3I4U2dRMmR0WThmZknSAacBQVVfeXFMT0V5T2FrNjRtNGNJSU9LR0d1SDNKeDFpOGpFTzBHTl9XeDU2VUZ2VXlZQy12UFk2MW9tZW5DOU9YdWFSVk1HM3R0M2FaSDQwZWRGT2h6Y3lYdlFXdERnQ183c1pWcUlxRzdEM3g4b2tZQ1hGdWF4MVlqaHZBbWd2NzNWWG1TbUhONnF1X293LUtpdGM4a092VkgzQThmaHZVdHRITnBxOFk?oc=5\" target=\"_blank\">Ucrania rompe la defensa aérea de Putin con un triple ataque</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">El HuffPost</font>",
"author": "Andrea Cadenas de Llano Sosa",
"newspaper": "El HuffPost",
"date": "Fri, 31 Jan 2025 16:55:20 GMT",
"link": "https://www.huffingtonpost.es/global/ucrania-rompe-defensaerea-putin-triple-ataquebr.html"
},
{
"title": "La vanidad del defensa central - 20minutos.es",
"content": "<a href=\"https://news.google.com/rss/articles/CBMidkFVX3lxTFBtS1FzQ3h1aTZGdm4xMk9jTy0yZVJaV1V0VzNLUTlqWkh2U3RFWUZMMXZxZVNKNmUxX2Q3U0FuWTFjSkRlMnRJREJ6M3hFZk1COFpCOENPb2FZNkNvUmw2bkNnSENxazF3NDB0dVZ3MjFLOVBvSkE?oc=5\" target=\"_blank\">La vanidad del defensa central</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">20minutos.es</font>",
"author": "Juan Luis Saldaña",
"newspaper": "20minutos.es",
"date": "Sat, 01 Feb 2025 05:45:00 GMT",
"link": "https://www.20minutos.es/noticia/5678058/0/vanidad-defensa-central/"
},
{
"title": "Arnau Comas, refuerzo invernal para la defensa de la SD Eibar - SD Eibar",
"content": "<a href=\"https://news.google.com/rss/articles/CBMimAFBVV95cUxQUHRmU3FaYlR6Ri1BMEVra3Ayb2RxUGlKV1psV19pSFNhY0gwd0RFZ0NpUUlVWUVLX1FaTFpENWlWcTA4N2ZIMEJiRFhCa0xLT2RmWWF3YnBWczdnaDV0MVMtMEtyTFBaNm14Y3RNY0VjdjFlNFFrY3hCSEZnTE5VUjZuMVdzWE0yQUFJN2pQb0J6WHBhaUtKaQ?oc=5\" target=\"_blank\">Arnau Comas, refuerzo invernal para la defensa de la SD Eibar</a>&nbsp;&nbsp;<font color=\"#6f6f6f\">SD Eibar</font>",
"author": "Autor no encontrado en los metadatos.",
"newspaper": "SD Eibar",
"date": "Fri, 31 Jan 2025 16:19:11 GMT",
"link": "https://www.sdeibar.com/noticias/arnau-comas-refuerzo-invernal-para-la-defensa-de-la-sd-eibar"
}
]

1169
pagina_web_sin_js.html Normal file

File diff suppressed because one or more lines are too long

View File

@ -3,11 +3,48 @@ from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
from googlenewsdecoder import gnewsdecoder
# HTTP headers for outgoing requests (passed to requests.get in
# get_final_url); the User-Agent is a desktop Chrome-on-Windows string.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
def get_final_url(url, timeout=10):
    """
    Follow *url* through any redirects and return the final URL.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on a host that
            never answers.

    Returns:
        The URL of the final response, or *url* unchanged when the request
        fails (including on timeout).
    """
    try:
        response = requests.get(
            url, headers=HEADERS, allow_redirects=True, timeout=timeout
        )
    except requests.RequestException as e:
        print(f"Error al seguir el enlace: {e}")
        return url  # On failure, fall back to the original link.
    return response.url
def get_author_from_script(url, timeout=60):
    """
    Run autorsearcher.py on *url* and return the author it prints.

    Args:
        url: Article URL handed to the script as its only argument.
        timeout: Seconds to wait for the script before giving up.

    Returns:
        The script's stripped standard output, or "Desconocido" when the
        script cannot be run, times out, exits with a non-zero status, or
        prints nothing.
    """
    import sys  # Local import: only needed to locate the running interpreter.

    # Use the interpreter running this program so the child process sees the
    # same installed packages; a bare "python" may resolve to another one.
    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, "autorsearcher.py", url],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as e:
        print(f"Error al obtener el autor para {url}: {e}")
        return "Desconocido"

    # A crashed script may leave partial output; do not treat it as an author.
    if result.returncode != 0:
        print(f"Error al obtener el autor para {url}: {result.stderr.strip()}")
        return "Desconocido"

    author = result.stdout.strip()
    return author if author else "Desconocido"
def get_url_from_google_news(url):
    """
    Decode a Google News article link into the publisher's URL.

    Args:
        url: Google News link to decode.

    Returns:
        The decoded URL when the decoder reports success; "N/C" when it
        reports failure or raises. Previously the exception path fell
        through and returned None, unlike the other failure path.
    """
    interval_time = 1  # Forwarded to gnewsdecoder as its `interval` argument.
    try:
        decoded_url = gnewsdecoder(url, interval=interval_time)
        if decoded_url.get("status"):
            return decoded_url["decoded_url"]
    except Exception as e:
        print(f"Error occurred: {e}")
    # Single sentinel for every failure so callers always receive a string.
    return "N/C"
def search_news(query):
"""
Busca noticias relacionadas con una palabra clave en Google News.
@ -19,12 +56,11 @@ def search_news(query):
print(f"Error al acceder a la página para la consulta '{query}': {response.status_code}")
return []
# Analizar el RSS como XML
soup = BeautifulSoup(response.content, 'xml') # Cambié 'html.parser' por 'xml'
articles = soup.find_all("item") # Los artículos están dentro de etiquetas <item> en RSS
soup = BeautifulSoup(response.content, 'xml')
articles = soup.find_all("item")
news_list = []
for article in articles[:10]: # Limitar a las 10 primeras noticias
for article in articles[:10]: # Limitar a los primeros 10 artículos
try:
title = article.title.get_text(strip=True)
content = article.description.get_text(strip=True) if article.description else "Sin descripción"
@ -32,13 +68,20 @@ def search_news(query):
source_info = article.source.get_text(strip=True) if article.source else "Desconocido"
date = article.pubDate.get_text(strip=True) if article.pubDate else "Fecha no disponible"
# Obtener la URL final del artículo
final_url = get_url_from_google_news(link)
# Obtener el autor usando autorsearcher.py
author = get_author_from_script(final_url)
news_item = {
"title": title,
"content": content,
"author": "Desconocido", # Autor no disponible en esta consulta
"author": author,
"newspaper": source_info,
"date": date,
"link": link
"link": final_url # Guardamos la URL final en lugar de la de Google News
}
news_list.append(news_item)
@ -78,4 +121,3 @@ def search_from_keywords_file():
# Ejecutar la búsqueda desde el archivo
search_from_keywords_file()