diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index f505d47..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-# Usa una imagen oficial de Python
-FROM python:3.9
-
-# Configurar el directorio de trabajo en el contenedor
-WORKDIR /app
-
-# Copiar los archivos de la aplicación al contenedor
-COPY app/ /app/
-
-# Instalar dependencias si es necesario (ajusta según tu requerimiento)
-RUN pip install mysql-connector-python schedule
-
-# Copiar el archivo de crontab y configurarlo
-COPY crontab.txt /etc/cron.d/crontab
-RUN chmod 0644 /etc/cron.d/crontab && crontab /etc/cron.d/crontab
-
-# Iniciar cron y ejecutar el script en segundo plano
-CMD cron && tail -f /var/log/cron.log
\ No newline at end of file
diff --git a/scrapper/autorsearcher.py b/app/autorsearcher.py
similarity index 92%
rename from scrapper/autorsearcher.py
rename to app/autorsearcher.py
index 13895c0..b60e648 100644
--- a/scrapper/autorsearcher.py
+++ b/app/autorsearcher.py
@@ -1,6 +1,5 @@
 import json
 import requests
-import sys
 from bs4 import BeautifulSoup
 import logging
 
@@ -86,17 +85,14 @@ def get_author_from_url(url):
 
     author = get_author_from_json_ld(soup)
     if author:
+        logging.info(author)
         return author
 
     author = get_author_from_meta(soup)
     if author:
+        logging.info(author)
         return author
 
-    return "Autor no encontrado en los metadatos."
+    logging.info("No encontrado autor")
+    return "Desconocido"
 
-if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        url = sys.argv[1]
-        print(get_author_from_url(url))
-    else:
-        print("Uso: python autorsearcher.py ")
diff --git a/scrapper/iacorrector.py b/app/iacorrector.py
similarity index 100%
rename from scrapper/iacorrector.py
rename to app/iacorrector.py
diff --git a/app/keywords.txt b/app/keywords.txt
new file mode 100644
index 0000000..6eb3ffe
--- /dev/null
+++ b/app/keywords.txt
@@ -0,0 +1,7 @@
+Defensa
+Fuerzas Armadas
+CNI
+Guardia Civil
+Inteligencia
+Policia
+Ejercito
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index 29e192f..f9103c8 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,12 +1,24 @@
 from fastapi import FastAPI
 from .database import Base, engine
 from .routes import router
+from apscheduler.schedulers.background import BackgroundScheduler
+from .webscrapper import ejecutar_scrapper
 
 # Crear las tablas en MySQL si no existen
 Base.metadata.create_all(bind=engine)
 
 # Inicializar FastAPI
 app = FastAPI()
+scheduler = BackgroundScheduler()
 
 # Incluir rutas
 app.include_router(router)
+
+@app.on_event("startup")
+def startup_event():
+    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
+    scheduler.start()
+
+@app.on_event("shutdown")
+def shutdown_event():
+    scheduler.shutdown()
diff --git a/app/requirements.txt b/app/requirements.txt
index 0bf128e..f4f4051 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -11,3 +11,5 @@ python-dotenv
 mysql-connector-python
 pymysql
 cryptography
+lxml
+apscheduler
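The main.py change above wires the scraper into the API process itself: a BackgroundScheduler is created at module level and a 24-hour interval job is registered when FastAPI starts. Note that `@app.on_event` is deprecated in recent FastAPI releases; a minimal sketch of the same wiring using the newer lifespan handler (assuming the same `ejecutar_scrapper` entry point) could look like this:

```python
from contextlib import asynccontextmanager

from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import FastAPI

from .webscrapper import ejecutar_scrapper  # same entry point as in the diff

scheduler = BackgroundScheduler()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Register the daily scraping job and start the scheduler when the app boots.
    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
    scheduler.start()
    yield
    # Stop the scheduler cleanly when the app shuts down.
    scheduler.shutdown()

app = FastAPI(lifespan=lifespan)
```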
diff --git a/app/routes.py b/app/routes.py
index 436c33f..2d36366 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -1,5 +1,6 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy.orm import Session
+from sqlalchemy.sql import func
 from .database import get_db
 from .models import NewsItem
 from pydantic import BaseModel
@@ -54,3 +55,196 @@ def create_news_item(item: NewsItemCreate, db: Session = Depends(get_db)):
 
     return {"message": "Noticia creada con éxito", "id": new_item.id, "titulo": new_item.titulo}
 
+@router.get("/news/count/by-source/date-range")
+def count_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"count_by_source_in_range": results}
+
+
+@router.get("/news/count/by-author/date-range")
+def count_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"count_by_author_in_range": results}
+
+
+@router.get("/news/count/favorable/by-author/date-range")
+def count_favorable_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"favorable_count_by_author_in_range": results}
+
+
+@router.get("/news/count/unfavorable/by-author/date-range")
+def count_unfavorable_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"unfavorable_count_by_author_in_range": results}
+
+@router.get("/news/count/favorable/by-source/date-range")
+def count_favorable_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"favorable_count_by_source_in_range": results}
+
+@router.get("/news/count/unfavorable/by-source/date-range")
+def count_unfavorable_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"unfavorable_count_by_source_in_range": results}
+
+@router.get("/news/neutral/date-range")
+def get_neutral_news_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem)
+        .filter(
+            NewsItem.favorable == False,
+            NewsItem.critico == False,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .all()
+    )
+    return results
+
+
+@router.get("/news/mixed/date-range")
+def get_mixed_news_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem)
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .all()
+    )
+    return results
+
+@router.get("/news/count/by-source")
+def count_news_by_source(db: Session = Depends(get_db)):
+    results = db.query(NewsItem.fuente, func.count(NewsItem.id)).group_by(NewsItem.fuente).all()
+    return {"count_by_source": results}
+
+@router.get("/news/count/by-author")
+def count_news_by_author(db: Session = Depends(get_db)):
+    results = db.query(NewsItem.autor, func.count(NewsItem.id)).group_by(NewsItem.autor).all()
+    return {"count_by_author": results}
+
+@router.get("/news/count/favorable/by-author")
+def count_favorable_news_by_author(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.favorable == True)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"favorable_count_by_author": results}
+
+@router.get("/news/count/unfavorable/by-author")
+def count_unfavorable_news_by_author(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.critico == True)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"unfavorable_count_by_author": results}
+
+@router.get("/news/count/favorable/by-source")
+def count_favorable_news_by_source(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.favorable == True)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"favorable_count_by_source": results}
+
+@router.get("/news/count/unfavorable/by-source")
+def count_unfavorable_news_by_source(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.critico == True)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"unfavorable_count_by_source": results}
+
+@router.get("/news/neutral")
+def get_neutral_news(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem)
+        .filter(NewsItem.favorable == False, NewsItem.critico == False)
+        .all()
+    )
+    return results
+
+
+@router.get("/news/mixed")
+def get_mixed_news(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem)
+        .filter(NewsItem.favorable == True, NewsItem.critico == True)
+        .all()
+    )
+    return results
+
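All of the new routes follow the same pattern: group `NewsItem` rows with `func.count`, filter on the `favorable`/`critico` flags, and optionally restrict to a date window passed as `fecha_inicio`/`fecha_fin` query parameters. A quick client-side usage sketch (host and port are assumptions; adjust to wherever the service is exposed):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed host/port for the FastAPI app

# FastAPI parses the ISO-8601 strings into the `datetime` query parameters
# declared by count_news_by_source_in_range.
resp = requests.get(
    f"{BASE_URL}/news/count/by-source/date-range",
    params={
        "fecha_inicio": "2025-01-01T00:00:00",
        "fecha_fin": "2025-01-31T23:59:59",
    },
)
resp.raise_for_status()
print(resp.json())  # e.g. {"count_by_source_in_range": [["elpais.com", 12], ...]}
```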
diff --git a/scrapper/webscrapper.py b/app/webscrapper.py
similarity index 85%
rename from scrapper/webscrapper.py
rename to app/webscrapper.py
index 86ce5cd..235b9d5 100644
--- a/scrapper/webscrapper.py
+++ b/app/webscrapper.py
@@ -1,12 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
 import time
-import subprocess
 from googlenewsdecoder import gnewsdecoder
-from iacorrector import is_security_related, is_critico, is_favorable # Importa la función desde iacorrector.py
+from .iacorrector import is_security_related, is_critico, is_favorable # Importa la función desde iacorrector.py
 from datetime import datetime
 import pytz
 import logging
+from .database import get_db
+from sqlalchemy.orm import Session
+from .routes import create_news_item, NewsItemCreate
+from .autorsearcher import get_author_from_url
 
 # Configuración del logging
 LOG_FILE = "app.log"
@@ -25,9 +28,8 @@ def get_author_from_script(url):
     Llama a autorsearcher.py con la URL de la noticia y devuelve el autor encontrado.
     """
     try:
-        result = subprocess.run(["python", "autorsearcher.py", url], capture_output=True, text=True)
-        author = result.stdout.strip()
-        return author if author else "Desconocido"
+        author = get_author_from_url(url)
+        return author
     except Exception as e:
         logging.info(f"Error al obtener el autor para {url}: {e}")
         return "Desconocido"
@@ -135,14 +137,22 @@ def search_news(query):
     return news_list
 
 def insertar_datos(news_item):
-    API_URL = "http://app:8000/news/"
+    # Obtener la sesión de base de datos usando get_db()
+    db: Session = next(get_db()) # Aquí obtenemos la sesión manualmente
+    try:
+        # Convertir diccionario en un objeto Pydantic
+        news_data = NewsItemCreate(**news_item)
+
+        # Llamar directamente a la función que inserta noticias en la base de datos
+        response = create_news_item(news_data, db)
 
-    response = requests.post(API_URL, json=news_item)
+        logging.info(f"Noticia '{news_item['titulo']}' creada con éxito. ID: {response['id']}")
 
-    if response.status_code == 200:
-        logging.info(f"Noticia '{news_item['titulo']}' creada con éxito.")
-    else:
-        logging.info(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json())
+    except Exception as e:
+        logging.error(f"Error al insertar '{news_item['titulo']}': {str(e)}")
+
+    finally:
+        db.close() # Cerrar la sesión después de su uso
 
 def search_from_keywords_file():
     """
@@ -151,7 +161,7 @@
     """
     all_news = [] # Lista para almacenar todas las noticias recolectadas
     try:
-        with open("keywords.txt", "r", encoding="utf-8") as file:
+        with open("/app/keywords.txt", "r", encoding="utf-8") as file:
            keywords = file.readlines()
 
        # Eliminar posibles saltos de línea y espacios extra
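`insertar_datos` no longer posts to the HTTP API; it pulls a session out of the `get_db()` generator, builds a `NewsItemCreate` from the scraped dictionary, calls `create_news_item` directly, and closes the session in `finally`. A hedged usage sketch follows; the field names are illustrative, based on the `NewsItem` columns referenced in routes.py and the old init.sql, and may not match the real `NewsItemCreate` schema exactly:

```python
from datetime import datetime

import pytz

from app.webscrapper import insertar_datos

# Illustrative payload; adjust the keys to the actual NewsItemCreate fields.
noticia = {
    "titulo": "Ejemplo de noticia",
    "contenido": "Texto completo de la noticia...",
    "autor": "Desconocido",
    "fuente": "example.com",
    "fecha": datetime.now(pytz.timezone("Europe/Madrid")),
    "link": "https://example.com/ejemplo",
    "favorable": False,
    "critico": False,
}

# Builds NewsItemCreate(**noticia), inserts it through create_news_item and
# closes the manually obtained session afterwards.
insertar_datos(noticia)
```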
""" try: - result = subprocess.run(["python", "autorsearcher.py", url], capture_output=True, text=True) - author = result.stdout.strip() - return author if author else "Desconocido" + author = get_author_from_url(url) + return author except Exception as e: logging.info(f"Error al obtener el autor para {url}: {e}") return "Desconocido" @@ -135,14 +137,22 @@ def search_news(query): return news_list def insertar_datos(news_item): - API_URL = "http://app:8000/news/" + # Obtener la sesión de base de datos usando get_db() + db: Session = next(get_db()) # Aquí obtenemos la sesión manualmente + try: + # Convertir diccionario en un objeto Pydantic + news_data = NewsItemCreate(**news_item) + + # Llamar directamente a la función que inserta noticias en la base de datos + response = create_news_item(news_data, db) - response = requests.post(API_URL, json=news_item) + logging.info(f"Noticia '{news_item['titulo']}' creada con éxito. ID: {response['id']}") - if response.status_code == 200: - logging.info(f"Noticia '{news_item['titulo']}' creada con éxito.") - else: - logging.info(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json()) + except Exception as e: + logging.error(f"Error al insertar '{news_item['titulo']}': {str(e)}") + + finally: + db.close() # Cerrar la sesión después de su uso def search_from_keywords_file(): """ @@ -151,7 +161,7 @@ def search_from_keywords_file(): all_news = [] # Lista para almacenar todas las noticias recolectadas try: - with open("keywords.txt", "r", encoding="utf-8") as file: + with open("/app/keywords.txt", "r", encoding="utf-8") as file: keywords = file.readlines() # Eliminar posibles saltos de línea y espacios extra diff --git a/crontab.txt b/crontab.txt deleted file mode 100644 index 7935aee..0000000 --- a/crontab.txt +++ /dev/null @@ -1 +0,0 @@ -0 1 * * * python3 /app/main.py >> /var/log/cron.log 2>&1 diff --git a/docker-compose.yml b/docker-compose.yml index 4787f65..1fa6836 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,8 @@ services: - MYSQL_PASSWORD=${MYSQL_PASSWORD} - MYSQL_DATABASE=${MYSQL_DATABASE} - MYSQL_PORT=${MYSQL_PORT} + - OLLAMA_URL=${OLLAMA_URL} + - OLLAMA_MODEL=${OLLAMA_MODEL} depends_on: mysql: condition: service_healthy @@ -36,20 +38,6 @@ services: timeout: 5s retries: 10 - scrapper: - build: - context: ./scrapper - dockerfile: Dockerfile - container_name: scrapper - depends_on: - - app - environment: - - OLLAMA_URL=${OLLAMA_URL} - - OLLAMA_MODEL=${OLLAMA_MODEL} - volumes: - - ./scrapper:/app/scrapper - - ./crontab.txt:/etc/crontabs/root - # metabase: # image: metabase/metabase:latest # container_name: metabase diff --git a/init.sql b/init.sql deleted file mode 100644 index cd15893..0000000 --- a/init.sql +++ /dev/null @@ -1,12 +0,0 @@ -CREATE DATABASE IF NOT EXISTS scraper_db; -USE scraper_db; - -CREATE TABLE IF NOT EXISTS noticias ( - id INT AUTO_INCREMENT PRIMARY KEY, - titulo VARCHAR(255) NOT NULL, - contenido TEXT, - autor VARCHAR(255), - fuente VARCHAR(255), - fecha DATETIME, - link TEXT -); \ No newline at end of file diff --git a/scrapper/Dockerfile b/scrapper/Dockerfile deleted file mode 100644 index 42d1fff..0000000 --- a/scrapper/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM python:latest -RUN apt-get update && apt-get install -y cron - -WORKDIR ./ - -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY . . 
diff --git a/scrapper/Dockerfile b/scrapper/Dockerfile
deleted file mode 100644
index 42d1fff..0000000
--- a/scrapper/Dockerfile
+++ /dev/null
@@ -1,20 +0,0 @@
-FROM python:latest
-RUN apt-get update && apt-get install -y cron
-
-WORKDIR ./
-
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-COPY . .
-# Copia el archivo crontab al contenedor
-COPY crontab.txt /etc/cron.d/my_cron_job
-
-# Asigna los permisos adecuados
-RUN chmod 0644 /etc/cron.d/my_cron_job
-
-# Asegura que el archivo se procese por cron
-RUN touch /var/log/cron.log && chmod 0666 /var/log/cron.log
-
-
-CMD ["sh", "-c", "cron -f"]
diff --git a/scrapper/app.log b/scrapper/app.log
deleted file mode 100644
index 8b13789..0000000
--- a/scrapper/app.log
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/scrapper/crontab.txt b/scrapper/crontab.txt
deleted file mode 100644
index f34555c..0000000
--- a/scrapper/crontab.txt
+++ /dev/null
@@ -1 +0,0 @@
-0 1 * * * python3 /app/scrapper/webscrapper.py >> /var/log/cron.log 2>&1 #modificar para ajustar
\ No newline at end of file
diff --git a/scrapper/keywords.txt b/scrapper/keywords.txt
deleted file mode 100644
index 601c5a5..0000000
--- a/scrapper/keywords.txt
+++ /dev/null
@@ -1 +0,0 @@
-Defensa
\ No newline at end of file
diff --git a/scrapper/requirements.txt b/scrapper/requirements.txt
deleted file mode 100644
index a2fde0f..0000000
--- a/scrapper/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-fastapi
-uvicorn
-requests
-beautifulsoup4
-googlenewsdecoder
-pytz
-logging
-sqlalchemy
-pydantic
-python-dotenv
-lxml
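All of the cron artefacts (both crontab files, the cron-based Dockerfiles, and the scrapper copies of keywords.txt and requirements.txt) are removed because scheduling now lives in the API process and the keyword list lives in /app/keywords.txt, one term per line. A minimal sketch of the keyword-loading step that `search_from_keywords_file` performs (the exact normalisation in webscrapper.py may differ):

```python
def load_keywords(path: str = "/app/keywords.txt") -> list[str]:
    """Read one search term per line, dropping blank lines and stray whitespace."""
    with open(path, "r", encoding="utf-8") as file:
        return [line.strip() for line in file if line.strip()]

if __name__ == "__main__":
    print(load_keywords())  # e.g. ['Defensa', 'Fuerzas Armadas', 'CNI', ...]
```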