encontrado autor y link
This commit is contained in:
136
autorsearcher.py
136
autorsearcher.py
@ -1,75 +1,93 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import requests
|
||||
|
||||
def get_html_without_javascript(url):
|
||||
import sys
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def download_html_as_human(url):
|
||||
"""
|
||||
Utiliza Selenium para obtener el HTML de una página web con JavaScript desactivado.
|
||||
Descarga el HTML de una página web simulando un navegador real y usando cookies de sesión.
|
||||
"""
|
||||
# Configuración de opciones de Chrome
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument("--disable-dev-shm-usage") # Usa memoria compartida si no hay suficiente RAM
|
||||
chrome_options.add_argument("--no-sandbox") # Desactiva el sandboxing (problema común en entornos sin root)
|
||||
chrome_options.add_argument("--disable-extensions") # Desactiva extensiones del navegador
|
||||
chrome_options.add_argument("--remote-debugging-port=9222") # Debugging remoto
|
||||
chrome_options.add_argument("--headless=new") # Nueva implementación de headless
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
session = requests.Session()
|
||||
|
||||
# Inicializar el driver de Chrome
|
||||
driver = webdriver.Chrome(options=chrome_options)
|
||||
response = session.get(url, headers=headers)
|
||||
|
||||
try:
|
||||
# Accede al sitio web
|
||||
driver.get(url)
|
||||
# Obtener el contenido de la página
|
||||
html_content = driver.page_source
|
||||
return html_content
|
||||
finally:
|
||||
# Cierra el navegador
|
||||
driver.quit()
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_author_from_html(html_content):
|
||||
def extract_author_from_json(json_data):
|
||||
"""
|
||||
Utiliza BeautifulSoup para extraer el autor de una página HTML.
|
||||
Extrae el autor del JSON-LD, incluso si está en una lista.
|
||||
"""
|
||||
try:
|
||||
# Analizar el HTML con BeautifulSoup
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
if isinstance(json_data, list):
|
||||
for item in json_data:
|
||||
author = extract_author_from_json(item)
|
||||
if author:
|
||||
return author
|
||||
elif isinstance(json_data, dict):
|
||||
if 'author' in json_data:
|
||||
author_data = json_data['author']
|
||||
if isinstance(author_data, list):
|
||||
for author in author_data:
|
||||
if isinstance(author, dict) and 'name' in author:
|
||||
return author['name']
|
||||
elif isinstance(author_data, dict) and 'name' in author_data:
|
||||
return author_data['name']
|
||||
return None
|
||||
|
||||
# Buscar la meta tag con name="autor"
|
||||
author_meta = soup.find('meta', attrs={'name': 'autor'})
|
||||
|
||||
# Si existe, devolver el contenido del atributo "content"
|
||||
if author_meta and 'content' in author_meta.attrs:
|
||||
return author_meta['content']
|
||||
else:
|
||||
return "No se encontró el autor en la meta etiqueta."
|
||||
except Exception as e:
|
||||
return f"Error al analizar el HTML: {e}"
|
||||
|
||||
# Función principal
|
||||
def main(url):
|
||||
def get_author_from_json_ld(soup):
|
||||
"""
|
||||
Combina ambas funciones para obtener el autor desde una página con JS desactivado.
|
||||
Extrae el autor de los metadatos JSON-LD, considerando estructuras con listas y objetos.
|
||||
"""
|
||||
try:
|
||||
# Obtener el HTML sin JavaScript usando Selenium
|
||||
print("Obteniendo HTML con JavaScript desactivado...")
|
||||
html_content = get_html_without_javascript(url)
|
||||
scripts = soup.find_all('script', type='application/ld+json')
|
||||
for script in scripts:
|
||||
try:
|
||||
json_data = json.loads(script.string)
|
||||
author = extract_author_from_json(json_data)
|
||||
if author:
|
||||
return author
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return None
|
||||
|
||||
# Guardar el HTML en un archivo local (opcional)
|
||||
with open("pagina_web_sin_js.html", "w", encoding="utf-8") as file:
|
||||
file.write(html_content)
|
||||
def get_author_from_meta(soup):
    """
    Return the author declared in the <meta property="nrbi:authors"> tag.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The tag's ``content`` value with surrounding whitespace removed, or
        ``None`` when the tag is missing, has no ``content`` attribute, or
        the value is blank.
    """
    meta_author = soup.find('meta', property='nrbi:authors')
    if meta_author is None:
        return None

    content = meta_author.attrs.get('content')
    if not isinstance(content, str):
        return None

    # A whitespace-only value is truthy and would otherwise be reported as
    # an author name, so normalise it to None.
    return content.strip() or None
|
||||
|
||||
# Obtener el autor desde el HTML
|
||||
print("Analizando HTML para obtener el autor...")
|
||||
author = get_author_from_html(html_content)
|
||||
def get_author_from_url(url):
|
||||
"""
|
||||
Busca el autor en los metadatos JSON-LD y en la etiqueta <meta> de una URL.
|
||||
"""
|
||||
html_content = download_html_as_human(url)
|
||||
if not html_content:
|
||||
print("error")
|
||||
return "No se pudo descargar la página."
|
||||
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
author = get_author_from_json_ld(soup)
|
||||
if author:
|
||||
return author
|
||||
except Exception as e:
|
||||
return f"Error general: {e}"
|
||||
|
||||
# Ejemplo de uso
|
||||
url = "https://www.abc.es/internacional/trump-presidente-vez-era-dorada-empieza-momento-20250120202851-nt.html"
|
||||
author = main(url)
|
||||
print(f"El autor es: {author}")
|
||||
author = get_author_from_meta(soup)
|
||||
if author:
|
||||
return author
|
||||
|
||||
return "Autor no encontrado en los metadatos."
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: the article URL is the first argument and
    # the author (or a not-found message) is printed on stdout.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        print(get_author_from_url(url))
    else:
        # Keep the usage text off stdout and signal failure through the exit
        # status, so a program capturing stdout does not read it as a result.
        print("Uso: python autorsearcher.py <URL>", file=sys.stderr)
        sys.exit(2)
|
||||
|
338
cookies.txt
Normal file
338
cookies.txt
Normal file
@ -0,0 +1,338 @@
|
||||
[
|
||||
{
|
||||
"name": "__Secure-1PAPISID",
|
||||
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 1
|
||||
},
|
||||
{
|
||||
"name": "__Secure-1PSID",
|
||||
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFTND9FvfSmXKvG7EhWgJ4hgACgYKAZgSARYSFQHGX2MiLo_4je-xWwmTLnlXP5Va_RoVAUF8yKrKCtOBz3FTPT0d-p0dW4vu0076",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 2
|
||||
},
|
||||
{
|
||||
"name": "__Secure-1PSIDCC",
|
||||
"value": "AKEyXzUd4l-NSvsadu5JrPxgjJm6PxbCCl2-3OgSwJ5HG3aDrfBJW22PV74GirgyFJgYVSu7mw",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769965125,
|
||||
"storeId": "firefox-default",
|
||||
"id": 3
|
||||
},
|
||||
{
|
||||
"name": "__Secure-1PSIDTS",
|
||||
"value": "sidts-CjEBmiPuTRfT43VJY05lEAkfcUHZ3VCWX9aRHEdwZxtXc4LtOH2h0Aq7oxhCy8rqyo10EAA",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769965123,
|
||||
"storeId": "firefox-default",
|
||||
"id": 4
|
||||
},
|
||||
{
|
||||
"name": "__Secure-3PAPISID",
|
||||
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 5
|
||||
},
|
||||
{
|
||||
"name": "__Secure-3PSID",
|
||||
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFCkMoPKmrDiRPcWjHSIbRNAACgYKAUcSARYSFQHGX2MivW7fEv6cBOgJFPhPcV6qKxoVAUF8yKpKHw_E6y4Avtii2PoRIyw20076",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 6
|
||||
},
|
||||
{
|
||||
"name": "__Secure-3PSIDCC",
|
||||
"value": "AKEyXzUgCPoQ0O0uww4uLiIAjVvZ8_SNjJEe-NuYvtLsnb9ETZaJyZtTroaByYu-BtNKc62jEfg",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769965125,
|
||||
"storeId": "firefox-default",
|
||||
"id": 7
|
||||
},
|
||||
{
|
||||
"name": "__Secure-3PSIDTS",
|
||||
"value": "sidts-CjEBmiPuTRfT43VJY05lEAkfcUHZ3VCWX9aRHEdwZxtXc4LtOH2h0Aq7oxhCy8rqyo10EAA",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769965123,
|
||||
"storeId": "firefox-default",
|
||||
"id": 8
|
||||
},
|
||||
{
|
||||
"name": "__Secure-ENID",
|
||||
"value": "25.SE=D8_8M9M1VzIRiiZUPSaP4JRb9ppYIQ82KAkJKDUKm7ILjfL-sCTzSniTZJsk6wdgKHys832yLnDxHPdNrtukJqHsJxhc8QAEEHxuf4xz_T3N3YQkMd72sI6vFpU_wFCUyl0rk_OrHf5qB9aIPJEIC7LmjFYYLl4hM4GaC6in_lNLbW6xKBeU2YbkVW66RJDrdC05LltVQBy98rWCTbf1aBKhB75qYUftGtlj3BoGtDsCKdVLXYi-jaQ9j5MetSVC1a0yr58duP9CbRVl8euY5Vga4FiUd6HiqsunYCzn_KgpxK1WuzapPvQ3joXE_4vIx6ebOs2X3I8V4CJ8wFzeygy-MJnCxBYb1rGyuxO4XtQwsEMa8DrJfbx9K69KS6ctzEck51EneWMmspaIbiH5W3S3IX9RUkghqybZ_6y7vWJQNoiQjwx4SeJjOnhy5bIyvXGDykSd8f4McpjGA4qwuFYwGhIF1pyD1A",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "lax",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769946337,
|
||||
"storeId": "firefox-default",
|
||||
"id": 9
|
||||
},
|
||||
{
|
||||
"name": "AEC",
|
||||
"value": "AVcja2el4JQqNhibDkfGyyOdb6_-7RkRMRgq4UAUbFz3ML6a0oPGNEG4Hjc",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "lax",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1747499754,
|
||||
"storeId": "firefox-default",
|
||||
"id": 10
|
||||
},
|
||||
{
|
||||
"name": "APISID",
|
||||
"value": "ISowM3NHT-yoAbMx/AIpIkP3Ni3zbhd6-S",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 11
|
||||
},
|
||||
{
|
||||
"name": "HSID",
|
||||
"value": "Ao9sW4GVwyC96YdSc",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 12
|
||||
},
|
||||
{
|
||||
"name": "NID",
|
||||
"value": "521=oXspz8mdQQUdk-0i_wzC3i-ZcHMpMECXF-T9zqxjTtRr0SOWciPctQomIqc1tX4OVCd1rMbFZtwuhDuLUzBchZhij6ho5N4iq79YYH39NvhW4014kyGmX2so1ewvALHC7lyWD-Qtb-Wws910_w5llt0hu4_3uoeifnRnmy01gfV_5hCBYyR7tCCtN4cVW3eXE7NeWkMAsfqcw37IdLJjHJWb5IwRrw7dhBKvo0OhgHYaSgkSJFLmvXKaWnze0_6t2-BbwslSmcZFNPA5CJgJGSkc20n1n-zD-hITrO4xCCYcJUE6nD-mjogVhgY3hagAoYRG6E15qOjWLMDoZORqgy9qi6ADsPe0Ebz3w_jWzHvMR9gFNSxkkYzgvLY75KGRX7W0pYjc5SNXecottZRcBYtEvHvtYUA4sfPUW24vDpHbDJFcf7SqScx3cII-i8two4RWqpzyXiN1hkW5GrdFzZ9_rCPa1QwTOYSrQ1RyZO8A7PZD8-e9b7UBZDrjqYhOfUZlx2qfb0lbUy2rl6SB-f376nEZPMHg_P7PYN2jKmgbcsooRenosPt4de8nqj9lgoF4CKYgBwGV3GGbK6_qA1jsfOzSoWyflO1lLmZSVRKeS9hdDjlVyvDizsuvio7lKvb4ukKDh9BXz2T8YHBK4vq96jBc0MU13FjxFRxed9eCRPZLKmMVBvZeG86yg-_wv2f4jxn0FL4SXgMJhTrdZ5YhEeV8jeDI_Yiw63w3YgRBy-D96jxpGPDFG0RKzxIYSfQmtfsTXiHbK5kzcqwnGkG9gxSuDOzU66etxEKrheQf9es0YY1nYgJMNMDcyLHI0rEB6PQIyTPRQ0NnLibJ_jOzfcTtwV1uLzF0ybc-ay6dtTFVk3Whv0xnds6KhHg7IPfB9jk_FDDEhwIsTTdGWhlafiyoIcpb0ySVyJa9uTmFjCXJ7ohEY9dzbh36gUrm1Apx9FuY48BZ_2Qos4SbhHNyIIuJL5PpZL5n-Boxpkm3Y-dA7CYxvmYbvclWMOiwyfRBRk1tksDJE0VNjWd11th-aepnEqBpKdd1kiEjeAzEFbJR1zyRI5obgDDSpCgVXILqmwE_ikdGT8N1bjVOcloHyFj8n2pcr5k-fm9yqHAji3XciLZOdmK4PIqxgVAau-rxviVdYgZ-ejc_f9ht5NdybzERGf1bccvz7Vrx4kaF_8KAoIsSEUckU5wr6rWc25xkAiToKFMug-s0YE6SBJx-GGvyRq8mYWMQBmKXN7gf_X967Uh1okMFXiCjT_OeoPtyAuG1amqEOpBiXMOy5atCvYKxgGpLP68GhVahiiTwfmMYuO93u5jcuiFkEDqKyfTEp8Nzt6_AWOYCjKxkxoln_kalNpA2itptpdTgbdWm-yBAQ-UX6IgqmPXBaq_LIJ9H0MH96GmW1GbXo1xCZtpQcX_4ZHblJQN0qIfcrsAA9YxsS2bPkn8kBEjG05nya5vKcHqabVAJycUdjYJ-Jcv4lffHJehfZ8rO6UDaZMmHwIFtO8MXxDMg8__rsnwoCJf2UjERAgunzm_6fjgk7bsSmx083Wvuz2ThyECubkHAamy4nmbMnHQu2NOmI8tS1-myT7Ax9eh2ktXbgI33mJS28vw2haaBclB56ViG5vjnWZI23w-t9J7MaepWv8MvydCa-FwAUq6LjwTte0AmUpv0QCkBJCpONt2J6wjKoJAzjJX9o9bLMvK_MPoZqwbL2YBlfcqaUBSLxqojgmpM9KcgGHDE0Z2gtKUGLURVLcdNb2kypy4lQFNnYJiBPkQaKbSx-FnTnZSeCwjCXbjLzwMIueYNRQG1DOPiQAEq8ZfOg3FCGXpN0yn-5iqKhxA0VQwP1bkNPSkJUj1z3_E-vc-4fcFHh1-IQmLost-FnQLa394RFbTgsTLI-0-AbsoKgrWPn02y6SAEyqkfPVtOZRc0X4YZvMzDpX-l4exKRHcm3s5N2TL_34VNJYCYrtENnT
_ksG32RouayI2ox8L1YBGHcShh4ezSa3hBmV76U70ByU2jzIpfxgI73mql4lf_6U4NH8t9UCCjC2qVm_bPUHuNB6GpdlOr8x0zGqEI6MtHunobQr6zfWDwJfYaMWweCk5HH-wy9J0rEe7g",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1754236994,
|
||||
"storeId": "firefox-default",
|
||||
"id": 13
|
||||
},
|
||||
{
|
||||
"name": "SAPISID",
|
||||
"value": "l025IpB7PfQpCIwv/AgXIS-5Ulzd7MUUdB",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 14
|
||||
},
|
||||
{
|
||||
"name": "SID",
|
||||
"value": "g.a000swjOh6XSGYi19FqGlLMv6ciO98kC_XNfbLONUdm43A1oc1fFcft1TRkY75bBTa0XBuSVAgACgYKAdYSARYSFQHGX2MiMAjs2bPQdC2UfdL8v0IzKRoVAUF8yKoLpJT0D-AbEsIwcO_MyCWn0076",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 15
|
||||
},
|
||||
{
|
||||
"name": "SIDCC",
|
||||
"value": "AKEyXzUYOUC0XkqwSRKRewqlXoMuGP9Feh_bprmEU-G4tRnFTVL-2EGZb9ROEaJWf9h2tMTHeks",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1769965125,
|
||||
"storeId": "firefox-default",
|
||||
"id": 16
|
||||
},
|
||||
{
|
||||
"name": "SSID",
|
||||
"value": "Ay3uB5fTOa7rN6h8V",
|
||||
"domain": ".google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": true,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1800808147,
|
||||
"storeId": "firefox-default",
|
||||
"id": 17
|
||||
},
|
||||
{
|
||||
"name": "_ga",
|
||||
"value": "GA1.1.1954799460.1737385433",
|
||||
"domain": ".news.google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1801501123,
|
||||
"storeId": "firefox-default",
|
||||
"id": 18
|
||||
},
|
||||
{
|
||||
"name": "_ga_SYGF1G18MM",
|
||||
"value": "GS1.1.1738427626.2.1.1738429123.0.0.0",
|
||||
"domain": ".news.google.com",
|
||||
"hostOnly": false,
|
||||
"path": "/",
|
||||
"secure": false,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1801501123,
|
||||
"storeId": "firefox-default",
|
||||
"id": 19
|
||||
},
|
||||
{
|
||||
"name": "GN_PREF",
|
||||
"value": "W251bGwsIkNBSVNEQWpYeTdtOEJoRDRxZWJNQWciXQ__",
|
||||
"domain": "news.google.com",
|
||||
"hostOnly": true,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1753153431,
|
||||
"storeId": "firefox-default",
|
||||
"id": 20
|
||||
},
|
||||
{
|
||||
"name": "OTZ",
|
||||
"value": "7918024_52_52_123900_48_436380",
|
||||
"domain": "news.google.com",
|
||||
"hostOnly": true,
|
||||
"path": "/",
|
||||
"secure": true,
|
||||
"httpOnly": false,
|
||||
"sameSite": "no_restriction",
|
||||
"session": false,
|
||||
"firstPartyDomain": "",
|
||||
"partitionKey": null,
|
||||
"expirationDate": 1739977432,
|
||||
"storeId": "firefox-default",
|
||||
"id": 21
|
||||
}
|
||||
]
|
40
iacorrector.py
Normal file
40
iacorrector.py
Normal file
@ -0,0 +1,40 @@
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Carga el modelo y el tokenizador (ajusta la ruta si es local)
|
||||
modelo_nombre = "meta-llama/Llama-3-8B" # O usa un modelo local como "ruta/al/modelo"
|
||||
tokenizer = AutoTokenizer.from_pretrained(modelo_nombre)
|
||||
modelo = AutoModelForCausalLM.from_pretrained(modelo_nombre, torch_dtype=torch.float16, device_map="auto")
|
||||
|
||||
# Umbral de logits (ajusta según pruebas)
|
||||
UMBRAL_LOGITS = -1.0
|
||||
|
||||
def evaluar_seguridad_nacional(texto):
    """
    Classify *texto* as related to national security by comparing answer logits.

    The model is prompted to answer 'sí' or 'no'. The logits at the last
    prompt position are read for both answers and compared against each
    other and against ``UMBRAL_LOGITS``.

    Args:
        texto: Text to evaluate.

    Returns:
        True when the 'sí' logit is greater than both the 'no' logit and
        ``UMBRAL_LOGITS``; False otherwise (including when neither answer
        maps to a single token of the tokenizer).
    """
    prompt = f"Evalúa si el siguiente texto está relacionado con defensa nacional, inteligencia, espionaje, fuerzas de seguridad, policía, ejército o fuerzas armadas. Responde solo con 'sí' o 'no'.\n\nTexto: {texto}\n\nRespuesta:"

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(modelo.device)

    # Single forward pass; gradients are not needed for scoring.
    with torch.no_grad():
        outputs = modelo(**inputs)

    # Logits at the last prompt position, i.e. the scores for the next token.
    logits = outputs.logits[:, -1, :]

    def _mejor_logit(variantes):
        """Return the highest logit among spellings that encode to one token."""
        mejor = -float("inf")
        for variante in variantes:
            ids = tokenizer.encode(variante, add_special_tokens=False)
            # Skip spellings split into several tokens: their first piece is
            # a prefix shared with unrelated words and would skew the score.
            if len(ids) == 1:
                mejor = max(mejor, logits[0, ids[0]].item())
        return mejor

    # The previous check (`id in tokenizer.get_vocab()`) compared an integer
    # id with the vocabulary's string keys, so it never matched and both
    # scores were always -inf. Encoding the answer text avoids that.
    # NOTE(review): the leading-space spellings assume the tokenizer attaches
    # the space to the following word after "Respuesta:" — confirm for the
    # model in use.
    logit_si = _mejor_logit(("sí", " sí"))
    logit_no = _mejor_logit(("no", " no"))

    # Decide from the logits.
    return logit_si > logit_no and logit_si > UMBRAL_LOGITS
|
||||
|
||||
# Ejemplo de uso
|
||||
texto_ejemplo = "El ejército ha desplegado unidades en la frontera para proteger la soberanía nacional."
|
||||
resultado = evaluar_seguridad_nacional(texto_ejemplo)
|
||||
print(f"¿El texto está relacionado con seguridad nacional? {resultado}")
|
@ -1 +1 @@
|
||||
Real Madrid
|
||||
Defensa
|
@ -0,0 +1,82 @@
|
||||
[
|
||||
{
|
||||
"title": "Giro de Sánchez en Defensa: envía una carta a Calviño junto a otros 18 líderes europeos para que el BEI eleve la inversión militar - El Mundo",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMiekFVX3lxTE9JcGJMVUUzZTY1OFpqRHNTd2xRUzUtTEt1bmotbGtFZjVkdnd3bzBXdU9ienhXWGtVaUhJY2dOajR1RUFIVEhWYTFubVdHNHExVXJ3QUpLcExva0lUbzBvbl9yTUtJWkZjVlluZzRYSUZlVEZaX012VmVR0gF6QVVfeXFMT3RiMTB3dVNseWRGMlVadGkweXowXzN5YTBIMFlqZjNfZXBfalN4REctZGtNZzYtTWp6WVo2VUtHT2NFRTZ2M1NoTWZWeEZkN2w3N3RQV2FuT2lrNHVpcDBlNlVSM2c5UGVxSEhvSXlmSFA0ZEZyYVViQ3c?oc=5\" target=\"_blank\">Giro de Sánchez en Defensa: envía una carta a Calviño junto a otros 18 líderes europeos para que el BEI eleve la inversión militar</a> <font color=\"#6f6f6f\">El Mundo</font>",
|
||||
"author": "Daniel Viaña",
|
||||
"newspaper": "El Mundo",
|
||||
"date": "Fri, 31 Jan 2025 10:46:17 GMT",
|
||||
"link": "https://www.elmundo.es/espana/2025/01/31/679ca997e85ece2e4e8b45b0.html"
|
||||
},
|
||||
{
|
||||
"title": "Las razones de Dallas para traspasar a Doncic: mejorar la defensa y su “condición física” - La Vanguardia",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMi1wFBVV95cUxOeVdhaWs4WW5Pak5ZUmF5ZDIySV9HVEJTT0lnVXg1Z0swZ1ROeGRQZzk5YktraTNFOXQ5WGFSSWx3cUI5Vnk2cUhxWW5NcG9UakJvMEhpQVVUeEtheEJsaEQzRXZESDRDc1lNc0l4WFkzSlRnenI1SlJHVUE2STdnQ0dZNG1vUExadXdJenB1MWdxM2NoNUdwQmRibGFJUnBVRElPVlJET3E4MzljQ29vSEF5N2hJcnBRZjV3Z3NMdmViLVlOMldYbFVuYkROR1lBVU8tbS1Ud9IB3AFBVV95cUxOdDE3OVlZeTF6QloyWEFQaWV0eVpDdzhORkJRY1I3UjBtYXktdE8wVmZHaVlqMlVNYU1lR1pMTnAxRUl4Z2pJRzd1bUU1REV2S0ZEeEJ3MmM4bUVDNHBnMlVHWHlhZjUwVGtSbDFfU1Y0aDAyN2xGVVpPYWxib2N3MVFpVVRZSTNFc3dMbnBwVzI1ZVFhRDdFS29YZ2NGejVQMDZDQXc0MVE1MDhrZjlwVW5uX2ZfT3JpNGE5S3JPTE5iZ2RLU1FUVDVzTVU1S0pRaG1zQjNHU2tTTjBI?oc=5\" target=\"_blank\">Las razones de Dallas para traspasar a Doncic: mejorar la defensa y su “condición física”</a> <font color=\"#6f6f6f\">La Vanguardia</font>",
|
||||
"author": "Pedro Ruiz",
|
||||
"newspaper": "La Vanguardia",
|
||||
"date": "Sun, 02 Feb 2025 11:04:48 GMT",
|
||||
"link": "https://www.lavanguardia.com/deportes/baloncesto/nba/20250202/10344099/razones-dallas-traspasar-doncic-mejorar-defensa-condicion-fisica.html"
|
||||
},
|
||||
{
|
||||
"title": "Miles de personas se manifiestan en Buenos Aires contra Milei en defensa del colectivo LGTBIQ - El Confidencial",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMi5AFBVV95cUxQX1V6RnR0MW5NaFdKMjg1aFpCd2VJNVZ3ZWg3eU1YTVZiWEJGdm1XSk1nRms4QldDU3VISktEd3FpNDVOYUZQMWNmWnk3dHlsc0ExSU94YmpqZ0dKZlRDNlcyX0hzcXotdS14a0YxaWUweWFQQzBsLU80RlRvZXhMbjlXbXVoS2JIZzg2b3ItT0pRT0FjdjRhbzR1MnhnbVBsNWlBa09QZUp5dkx6YXkxODJlTFl3VEs2azk2dnYwaFcxNGJHUFRkUG01U0VZb3U4THVINzVZZUJJZjRtSmRCVWRTcTnSAeoBQVVfeXFMT3gxQkpSNTg3U1lVUE1VYk4zRW9QTV9fR1BsOEZxNmFBZFhfMVVGQ1ZzbU9mRjRvOFhUVTE3MTBfcDJlUmJjNS1kSl8yN3ZWaEdTUXQ1cU95WjVrN0VndVA5bjVremNKOWFPdkNtT3k2Qlh2SXFETGxMYXBCRHQzZVBfMWJFS0Q3UmdnRk5EWXJhWE1TWDlzVk9NazNiQzFOcWRpYXBHbTlhWVNGRFA4ZHJNdjQxRUhIVUdLdEVYVkhYMHVOajFMcHFjVFZaZTZmNmR4eFNtZU1sMm9vTnlNSHBkR3d1c1ZjYmd3?oc=5\" target=\"_blank\">Miles de personas se manifiestan en Buenos Aires contra Milei en defensa del colectivo LGTBIQ</a> <font color=\"#6f6f6f\">El Confidencial</font>",
|
||||
"author": "Borja Fernández",
|
||||
"newspaper": "El Confidencial",
|
||||
"date": "Sun, 02 Feb 2025 11:50:00 GMT",
|
||||
"link": "https://www.elconfidencial.com/mundo/2025-02-02/miles-de-personas-se-manifiestan-en-buenos-aires-contra-milei-en-defensa-del-colectivo-lgtbiq_4055892/"
|
||||
},
|
||||
{
|
||||
"title": "Defensa revisa partidas para subir el gasto ante la OTAN por la vía rápida - ABC.es",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMimgFBVV95cUxNSTU0YWpLQ0czbS1oZUhJWC1zZXo4a2JsWHZYSEluS2h4OVV5bWt1SmFwRFRZZ3dCaUFoeVNKOWZ5cWQyMElmelg0LS1peWl3b04tU2tSWmU1Z3BLblVlcmlqMnd2VThjNnlwaFBXZFNJZWx2ZGlKWXNqb3hOa1hNbEhha2trZURac3NRMXE3WjFaWVctMVZaM1dR0gGfAUFVX3lxTFBfMDJ6RDFpTXJheklMNi1LZHdIMjlSTVIwMzVyZVR6UktTWHZ4WVF1M1ZnNXVXa2ItQzA2Q1haSEN3YUEzM0Nxa3ZvX1k5dWRaYmtsVWdLQUNBS1hlNklXN0x1ZDdqYVNwQy1mMm1peHJKbUQ1VlA5SmZDQjFKRklXekFGWldjWE1FZkxtR0drck9iRmcxd25jTWVTbkZncw?oc=5\" target=\"_blank\">Defensa revisa partidas para subir el gasto ante la OTAN por la vía rápida</a> <font color=\"#6f6f6f\">ABC.es</font>",
|
||||
"author": "Ana Sánchez",
|
||||
"newspaper": "ABC.es",
|
||||
"date": "Sat, 01 Feb 2025 03:38:34 GMT",
|
||||
"link": "https://www.abc.es/espana/defensa-revisa-partidas-subir-gasto-ante-otan-20250201194511-nt.html"
|
||||
},
|
||||
{
|
||||
"title": "La Defensa de la Trinchera: Inversión en ETFs - esRadio",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMiywFBVV95cUxPZkZYYkMwakh0RVhYVkg1S0EtMHRCN0M5a2FFWEtkVXczT3F0Nm9nT0lERk1NRDFxSkI3UUxrWU9nQTRPclV0R3hVdmZVdXFhVlk4a21LZjI0akVRZGh1cmVFelVBV0UxUldtenF1YWV0VkdrZkRyN01IUDdwS1gydTFlTHBTamlRTDFTVEtXNnlNb0p4dFRnQ3ZwYWxnbUJuVXlUWi16YjVBbHdoZE9ZcDV5ZnBBZTN3TzhrVVBSTmRQeDFELVJGQlhUQQ?oc=5\" target=\"_blank\">La Defensa de la Trinchera: Inversión en ETFs</a> <font color=\"#6f6f6f\">esRadio</font>",
|
||||
"author": "Manuel Llamas",
|
||||
"newspaper": "esRadio",
|
||||
"date": "Sun, 02 Feb 2025 10:15:00 GMT",
|
||||
"link": "https://esradio.libertaddigital.com/fonoteca/2025-02-02/la-defensa-de-la-trinchera-inversion-en-etfs-p7106614-s7107392-7213634.html"
|
||||
},
|
||||
{
|
||||
"title": "La Laguna Tenerife, a por la defensa del cuarto puesto - Diario de Avisos",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMioAFBVV95cUxQZDhXNC1fWWNVT3J0anktb0Q5ejNNcFM1OUNEMVNGc2FXb00yVk5SblBYRnJ0VE01Ymg3ckpYZHFhbUJpNkRfSWNVLW9NcU5GQXNKUUlQOVctdHBuTmduTFhYVS1UZURWS2hrRXl2U2UtSGtad0VYbGZxMHFCUG1lU1B6dVNsdGYwVnN0cjdBbjVsNnc3RENmMFNFblUxRGln0gHuAkFVX3lxTE41X3dNRkxoYlN5NVJuWHhtaXlXV1V1MFJwTHJVYmc1UlhXT3JpSUl6LUhFQ3BFSmRRMzY3emJHS19SSVl2OUxDVUhUUktpSU9jeEhyUElURzR3VHpubXNfUmp4M3hrdzRqV2pST3dKTlNrYUJNWlRYbjhscFo1MlMyZWYybS1QdHRTdW04QXo2SVlXbG5zeEpjdm44bXdRZHlSaXpCckZ1T2xZQzUwRG14Y3praEVBUlZERHYzOTd1WklxWGstV21tT05PMFFwMXVaczJPSGNmRUc4RkxydTZ4YXo3cnhyQ0dNc29EZU8zYXJLS3o0cExxWTVMa3I3V0NQVDVleEhSZFEzenVvUEp5Wm83aDlTQ3podWR0aFctTVdyNFQtNzQxRlMxaUlJUHFUdm13TV84Y1lOR1pjVldlcjRsUnJoQXN2THRVX3hIM0pOS1htalFudlpoVU1xNWs2TFBPYXZEbXNpUUV2UQ?oc=5\" target=\"_blank\">La Laguna Tenerife, a por la defensa del cuarto puesto</a> <font color=\"#6f6f6f\">Diario de Avisos</font>",
|
||||
"author": "Autor no encontrado en los metadatos.",
|
||||
"newspaper": "Diario de Avisos",
|
||||
"date": "Sun, 02 Feb 2025 00:15:24 GMT",
|
||||
"link": "https://diariodeavisos.elespanol.com/2025/02/la-laguna-tenerife-a-por-la-defensa-del-cuarto-puesto/"
|
||||
},
|
||||
{
|
||||
"title": "Defensa niega que exista un brote de sarna entre los soldados en Valencia y reduce los contagios a dos militares - Infobae España",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMi6wFBVV95cUxQeXpTSkl6R2VrSVhVSnZKdC1hcXdjdG4wb0xRelVvV0h3M2p6UXlPaDl6dkh6d1RJN3N6WlFsR1lwR19VYVBsakJBR2p6OV9nU1AzZ0YtMk1lc1JmdmpIUG10NUdWOHY1UTh0dVFFVzRiQmVfTEZaYU5qSHV0RkVrTTRybjNsX0tvVlNRbHF5b19Jdzdob0ViMjR1ME0zOUpvekxmWXpScS1GOGZ3bjVvWUIyUllmUWZNTUw2dXRmRUNMYjJ1VGFDc1hJZ2MwYTROMmJvVGZpdXdiWENsM01Ra3dpMDVUMFhfUjZB0gGGAkFVX3lxTE9Zd0RxZW1VVWYzOWJPUEpOdVFEbGVsQ3g2MlhuTjNuSDZVdDhYd2ZOV0V5S25FMnVqUjRxeTVYZnlLejc2blNWM2JiUFdJTWlOQW1LRENlNU95dUFBb0wwc3ZpMTRFbWxlT1VwMlQ1MXpkelpENC02ei02b2l1S3pIcjNrMHVmTjZJU0g5b3B1a0NCdTU2MGdEQWZUSERmSS1wOHN2ajFmVDhTVkpkbDNWbVhKT28ya09XclltYkFlUkxoYTR1MERnRnRPMHVsbWJiZ1lGcThVYXBBMmZ3cE5kNWVDNktRakJSWnlLZ3BmWlYtc19PaU9CTFk4ZHpaZTRDa3R4YlE?oc=5\" target=\"_blank\">Defensa niega que exista un brote de sarna entre los soldados en Valencia y reduce los contagios a dos militares</a> <font color=\"#6f6f6f\">Infobae España</font>",
|
||||
"author": "Gastón Trelles",
|
||||
"newspaper": "Infobae España",
|
||||
"date": "Sat, 01 Feb 2025 12:34:00 GMT",
|
||||
"link": "https://www.infobae.com/espana/2025/02/01/defensa-niega-que-exista-un-brote-de-sarna-entre-los-soldados-en-valencia-y-reduce-los-contagios-a-dos-militares/"
|
||||
},
|
||||
{
|
||||
"title": "Ucrania rompe la defensa aérea de Putin con un triple ataque - El HuffPost",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMikwFBVV95cUxPSlJ6TDF2N3R6WE5WeVZ3aHgxQU9ZM1U4TVVmMjdpTHRWVk5jTU5QSF9YcGNNdVFlaXVlRFAxUS1SeUtFb1hFS1B0ZFFEUXA4dkJaN0ZUWEFvd0FFME9LRzNCOWt0enNzc2lEZ1k3OGVlQmJsWTFZS0pyenpCcEZ1b3EwOHhKV3I4U2dRMmR0WThmZknSAacBQVVfeXFMT0V5T2FrNjRtNGNJSU9LR0d1SDNKeDFpOGpFTzBHTl9XeDU2VUZ2VXlZQy12UFk2MW9tZW5DOU9YdWFSVk1HM3R0M2FaSDQwZWRGT2h6Y3lYdlFXdERnQ183c1pWcUlxRzdEM3g4b2tZQ1hGdWF4MVlqaHZBbWd2NzNWWG1TbUhONnF1X293LUtpdGM4a092VkgzQThmaHZVdHRITnBxOFk?oc=5\" target=\"_blank\">Ucrania rompe la defensa aérea de Putin con un triple ataque</a> <font color=\"#6f6f6f\">El HuffPost</font>",
|
||||
"author": "Andrea Cadenas de Llano Sosa",
|
||||
"newspaper": "El HuffPost",
|
||||
"date": "Fri, 31 Jan 2025 16:55:20 GMT",
|
||||
"link": "https://www.huffingtonpost.es/global/ucrania-rompe-defensaerea-putin-triple-ataquebr.html"
|
||||
},
|
||||
{
|
||||
"title": "La vanidad del defensa central - 20minutos.es",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMidkFVX3lxTFBtS1FzQ3h1aTZGdm4xMk9jTy0yZVJaV1V0VzNLUTlqWkh2U3RFWUZMMXZxZVNKNmUxX2Q3U0FuWTFjSkRlMnRJREJ6M3hFZk1COFpCOENPb2FZNkNvUmw2bkNnSENxazF3NDB0dVZ3MjFLOVBvSkE?oc=5\" target=\"_blank\">La vanidad del defensa central</a> <font color=\"#6f6f6f\">20minutos.es</font>",
|
||||
"author": "Juan Luis Saldaña",
|
||||
"newspaper": "20minutos.es",
|
||||
"date": "Sat, 01 Feb 2025 05:45:00 GMT",
|
||||
"link": "https://www.20minutos.es/noticia/5678058/0/vanidad-defensa-central/"
|
||||
},
|
||||
{
|
||||
"title": "Arnau Comas, refuerzo invernal para la defensa de la SD Eibar - SD Eibar",
|
||||
"content": "<a href=\"https://news.google.com/rss/articles/CBMimAFBVV95cUxQUHRmU3FaYlR6Ri1BMEVra3Ayb2RxUGlKV1psV19pSFNhY0gwd0RFZ0NpUUlVWUVLX1FaTFpENWlWcTA4N2ZIMEJiRFhCa0xLT2RmWWF3YnBWczdnaDV0MVMtMEtyTFBaNm14Y3RNY0VjdjFlNFFrY3hCSEZnTE5VUjZuMVdzWE0yQUFJN2pQb0J6WHBhaUtKaQ?oc=5\" target=\"_blank\">Arnau Comas, refuerzo invernal para la defensa de la SD Eibar</a> <font color=\"#6f6f6f\">SD Eibar</font>",
|
||||
"author": "Autor no encontrado en los metadatos.",
|
||||
"newspaper": "SD Eibar",
|
||||
"date": "Fri, 31 Jan 2025 16:19:11 GMT",
|
||||
"link": "https://www.sdeibar.com/noticias/arnau-comas-refuerzo-invernal-para-la-defensa-de-la-sd-eibar"
|
||||
}
|
||||
]
|
1169
pagina_web_sin_js.html
Normal file
1169
pagina_web_sin_js.html
Normal file
File diff suppressed because one or more lines are too long
@ -3,11 +3,48 @@ from bs4 import BeautifulSoup
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
from googlenewsdecoder import gnewsdecoder
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
}
|
||||
|
||||
def get_final_url(url, timeout=10):
    """
    Follow a link and return the URL reached after any redirects.

    Args:
        url: Link to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        The final URL reported by the response, or the original ``url`` when
        the request fails.
    """
    try:
        response = requests.get(
            url,
            headers=HEADERS,
            allow_redirects=True,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Error al seguir el enlace: {e}")
        return url  # On error, fall back to the original link.
    return response.url
|
||||
|
||||
def get_author_from_script(url, timeout=60):
    """
    Run autorsearcher.py on an article URL and return the author it prints.

    Args:
        url: Article URL passed to the script as its only argument.
        timeout: Seconds to wait for the script before giving up.

    Returns:
        The script's stripped standard output, or "Desconocido" when the URL
        is missing, the script cannot be run, times out, exits with a
        non-zero status, or prints nothing.
    """
    import sys  # Local import: only needed to locate the running interpreter.

    # "N/C" is the placeholder this file uses for a link that could not be
    # decoded; there is nothing to look up in that case.
    if not url or url == "N/C":
        return "Desconocido"

    try:
        # sys.executable guarantees the same interpreter (and environment)
        # as the caller, which a bare "python" on PATH does not.
        result = subprocess.run(
            [sys.executable, "autorsearcher.py", url],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Error al obtener el autor para {url}: {e}")
        return "Desconocido"

    if result.returncode != 0:
        # A failed run may leave partial text on stdout; do not treat it as
        # an author name.
        print(f"Error al obtener el autor para {url}: {result.stderr.strip()}")
        return "Desconocido"

    author = result.stdout.strip()
    return author if author else "Desconocido"
|
||||
|
||||
def get_url_from_google_news(url):
    """
    Decode a Google News link into the URL of the underlying article.

    Args:
        url: Google News article link to decode.

    Returns:
        The decoded URL when the decoder reports success, otherwise the
        placeholder "N/C". The placeholder is also returned when decoding
        raises, so callers always receive a string.
    """
    interval_time = 1
    try:
        decoded_url = gnewsdecoder(url, interval=interval_time)

        if decoded_url.get("status"):
            return decoded_url["decoded_url"]
        return "N/C"
    except Exception as e:
        # Broad on purpose: the decoder is an external helper and any failure
        # here should only skip this link, not abort the whole search.
        print(f"Error occurred: {e}")
        return "N/C"
|
||||
|
||||
def search_news(query):
|
||||
"""
|
||||
Busca noticias relacionadas con una palabra clave en Google News.
|
||||
@ -19,12 +56,11 @@ def search_news(query):
|
||||
print(f"Error al acceder a la página para la consulta '{query}': {response.status_code}")
|
||||
return []
|
||||
|
||||
# Analizar el RSS como XML
|
||||
soup = BeautifulSoup(response.content, 'xml') # Cambié 'html.parser' por 'xml'
|
||||
articles = soup.find_all("item") # Los artículos están dentro de etiquetas <item> en RSS
|
||||
soup = BeautifulSoup(response.content, 'xml')
|
||||
articles = soup.find_all("item")
|
||||
news_list = []
|
||||
|
||||
for article in articles[:10]: # Limitar a las 10 primeras noticias
|
||||
for article in articles[:10]: # Limitar a los primeros 10 artículos
|
||||
try:
|
||||
title = article.title.get_text(strip=True)
|
||||
content = article.description.get_text(strip=True) if article.description else "Sin descripción"
|
||||
@ -32,13 +68,20 @@ def search_news(query):
|
||||
source_info = article.source.get_text(strip=True) if article.source else "Desconocido"
|
||||
date = article.pubDate.get_text(strip=True) if article.pubDate else "Fecha no disponible"
|
||||
|
||||
|
||||
# Obtener la URL final del artículo
|
||||
final_url = get_url_from_google_news(link)
|
||||
|
||||
# Obtener el autor usando autorsearcher.py
|
||||
author = get_author_from_script(final_url)
|
||||
|
||||
news_item = {
|
||||
"title": title,
|
||||
"content": content,
|
||||
"author": "Desconocido", # Autor no disponible en esta consulta
|
||||
"author": author,
|
||||
"newspaper": source_info,
|
||||
"date": date,
|
||||
"link": link
|
||||
"link": final_url # Guardamos la URL final en lugar de la de Google News
|
||||
}
|
||||
|
||||
news_list.append(news_item)
|
||||
@ -78,4 +121,3 @@ def search_from_keywords_file():
|
||||
|
||||
# Ejecutar la búsqueda desde el archivo
|
||||
search_from_keywords_file()
|
||||
|
||||
|
Reference in New Issue
Block a user