Your Name
2025-02-09 21:56:13 +01:00
parent 94da85702f
commit b4b2d899aa
16 changed files with 243 additions and 99 deletions

View File

@@ -1,18 +0,0 @@
# Use an official Python image
FROM python:3.9

# Set the working directory in the container
WORKDIR /app

# Copy the application files into the container
COPY app/ /app/

# Install dependencies if needed (adjust to your requirements)
RUN pip install mysql-connector-python schedule

# Copy the crontab file and set it up
COPY crontab.txt /etc/cron.d/crontab
RUN chmod 0644 /etc/cron.d/crontab && crontab /etc/cron.d/crontab

# Start cron and run the script in the background
CMD cron && tail -f /var/log/cron.log

View File

@@ -1,6 +1,5 @@
 import json
 import requests
-import sys
 from bs4 import BeautifulSoup
 import logging
@@ -86,17 +85,14 @@ def get_author_from_url(url):
     author = get_author_from_json_ld(soup)
     if author:
+        logging.info(author)
         return author

     author = get_author_from_meta(soup)
     if author:
+        logging.info(author)
         return author

-    return "Autor no encontrado en los metadatos."
+    logging.info("No encontrado autor")
+    return "Desconocido"

-if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        url = sys.argv[1]
-        print(get_author_from_url(url))
-    else:
-        print("Uso: python autorsearcher.py <URL>")

app/keywords.txt Normal file
View File

@@ -0,0 +1,7 @@
Defensa
Fuerzas Armadas
CNI
Guardia Civil
Inteligencia
Policia
Ejercito

View File

@@ -1,12 +1,24 @@
 from fastapi import FastAPI
 from .database import Base, engine
 from .routes import router
+from apscheduler.schedulers.background import BackgroundScheduler
+from .webscrapper import ejecutar_scrapper

 # Create the tables in MySQL if they do not exist
 Base.metadata.create_all(bind=engine)

 # Initialize FastAPI
 app = FastAPI()
+scheduler = BackgroundScheduler()

 # Include routes
 app.include_router(router)
+
+@app.on_event("startup")
+def startup_event():
+    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
+    scheduler.start()
+
+@app.on_event("shutdown")
+def shutdown_event():
+    scheduler.shutdown()
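Side note (not part of the commit): recent FastAPI versions deprecate the on_event hooks in favour of a lifespan handler. A minimal sketch of the same scheduler wiring in that style, assuming the same ejecutar_scrapper job and 24-hour interval:

from contextlib import asynccontextmanager
from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import FastAPI
from .webscrapper import ejecutar_scrapper

scheduler = BackgroundScheduler()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Register the daily scraping job and start the scheduler on startup
    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
    scheduler.start()
    yield
    # Stop the scheduler cleanly on shutdown
    scheduler.shutdown()

app = FastAPI(lifespan=lifespan)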

View File

@@ -11,3 +11,5 @@ python-dotenv
 mysql-connector-python
 pymysql
 cryptography
+lxml
+apscheduler

View File

@@ -1,5 +1,6 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy.orm import Session
+from sqlalchemy.sql import func
 from .database import get_db
 from .models import NewsItem
 from pydantic import BaseModel
@@ -54,3 +55,196 @@ def create_news_item(item: NewsItemCreate, db: Session = Depends(get_db)):
    return {"message": "Noticia creada con éxito", "id": new_item.id, "titulo": new_item.titulo}
@router.get("/news/count/by-source/date-range")
def count_news_by_source_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.fuente, func.count(NewsItem.id))
        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
        .group_by(NewsItem.fuente)
        .all()
    )
    return {"count_by_source_in_range": results}

@router.get("/news/count/by-author/date-range")
def count_news_by_author_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.autor, func.count(NewsItem.id))
        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
        .group_by(NewsItem.autor)
        .all()
    )
    return {"count_by_author_in_range": results}

@router.get("/news/count/favorable/by-author/date-range")
def count_favorable_news_by_author_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.autor, func.count(NewsItem.id))
        .filter(
            NewsItem.favorable == True,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .group_by(NewsItem.autor)
        .all()
    )
    return {"favorable_count_by_author_in_range": results}

@router.get("/news/count/unfavorable/by-author/date-range")
def count_unfavorable_news_by_author_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.autor, func.count(NewsItem.id))
        .filter(
            NewsItem.critico == True,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .group_by(NewsItem.autor)
        .all()
    )
    return {"unfavorable_count_by_author_in_range": results}

@router.get("/news/count/favorable/by-source/date-range")
def count_favorable_news_by_source_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.fuente, func.count(NewsItem.id))
        .filter(
            NewsItem.favorable == True,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .group_by(NewsItem.fuente)
        .all()
    )
    return {"favorable_count_by_source_in_range": results}

@router.get("/news/count/unfavorable/by-source/date-range")
def count_unfavorable_news_by_source_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem.fuente, func.count(NewsItem.id))
        .filter(
            NewsItem.critico == True,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .group_by(NewsItem.fuente)
        .all()
    )
    return {"unfavorable_count_by_source_in_range": results}

@router.get("/news/neutral/date-range")
def get_neutral_news_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem)
        .filter(
            NewsItem.favorable == False,
            NewsItem.critico == False,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .all()
    )
    return results

@router.get("/news/mixed/date-range")
def get_mixed_news_in_range(
    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
):
    results = (
        db.query(NewsItem)
        .filter(
            NewsItem.favorable == True,
            NewsItem.critico == True,
            NewsItem.fecha >= fecha_inicio,
            NewsItem.fecha <= fecha_fin,
        )
        .all()
    )
    return results

@router.get("/news/count/by-source")
def count_news_by_source(db: Session = Depends(get_db)):
    results = db.query(NewsItem.fuente, func.count(NewsItem.id)).group_by(NewsItem.fuente).all()
    return {"count_by_source": results}

@router.get("/news/count/by-author")
def count_news_by_author(db: Session = Depends(get_db)):
    results = db.query(NewsItem.autor, func.count(NewsItem.id)).group_by(NewsItem.autor).all()
    return {"count_by_author": results}

@router.get("/news/count/favorable/by-author")
def count_favorable_news_by_author(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem.autor, func.count(NewsItem.id))
        .filter(NewsItem.favorable == True)
        .group_by(NewsItem.autor)
        .all()
    )
    return {"favorable_count_by_author": results}

@router.get("/news/count/unfavorable/by-author")
def count_unfavorable_news_by_author(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem.autor, func.count(NewsItem.id))
        .filter(NewsItem.critico == True)
        .group_by(NewsItem.autor)
        .all()
    )
    return {"unfavorable_count_by_author": results}

@router.get("/news/count/favorable/by-source")
def count_favorable_news_by_source(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem.fuente, func.count(NewsItem.id))
        .filter(NewsItem.favorable == True)
        .group_by(NewsItem.fuente)
        .all()
    )
    return {"favorable_count_by_source": results}

@router.get("/news/count/unfavorable/by-source")
def count_unfavorable_news_by_source(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem.fuente, func.count(NewsItem.id))
        .filter(NewsItem.critico == True)
        .group_by(NewsItem.fuente)
        .all()
    )
    return {"unfavorable_count_by_source": results}

@router.get("/news/neutral")
def get_neutral_news(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem)
        .filter(NewsItem.favorable == False, NewsItem.critico == False)
        .all()
    )
    return results

@router.get("/news/mixed")
def get_mixed_news(db: Session = Depends(get_db)):
    results = (
        db.query(NewsItem)
        .filter(NewsItem.favorable == True, NewsItem.critico == True)
        .all()
    )
    return results
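Usage sketch for the new endpoints (not part of the commit): the date-range routes take fecha_inicio and fecha_fin as ISO 8601 query parameters, which FastAPI parses into datetime objects; the host and dates below are illustrative.

import requests

BASE_URL = "http://localhost:8000"  # illustrative; adjust to wherever the API is exposed

params = {
    "fecha_inicio": "2025-01-01T00:00:00",
    "fecha_fin": "2025-02-01T00:00:00",
}

# Per-source counts within the date range
resp = requests.get(f"{BASE_URL}/news/count/by-source/date-range", params=params)
print(resp.json())  # {"count_by_source_in_range": [[fuente, count], ...]}

# Neutral items (neither favorable nor critical) within the same range
resp = requests.get(f"{BASE_URL}/news/neutral/date-range", params=params)
print(resp.json())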

View File

@@ -1,12 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
 import time
-import subprocess
 from googlenewsdecoder import gnewsdecoder
-from iacorrector import is_security_related, is_critico, is_favorable  # Imports the functions from iacorrector.py
+from .iacorrector import is_security_related, is_critico, is_favorable  # Imports the functions from iacorrector.py
 from datetime import datetime
 import pytz
 import logging
+from .database import get_db
+from sqlalchemy.orm import Session
+from .routes import create_news_item, NewsItemCreate
+from .autorsearcher import get_author_from_url

 # Logging configuration
 LOG_FILE = "app.log"
@@ -25,9 +28,8 @@ def get_author_from_script(url):
     Calls autorsearcher.py with the news URL and returns the author found.
     """
     try:
-        result = subprocess.run(["python", "autorsearcher.py", url], capture_output=True, text=True)
-        author = result.stdout.strip()
-        return author if author else "Desconocido"
+        author = get_author_from_url(url)
+        return author
     except Exception as e:
         logging.info(f"Error al obtener el autor para {url}: {e}")
         return "Desconocido"
@@ -135,14 +137,22 @@ def search_news(query):
     return news_list

 def insertar_datos(news_item):
-    API_URL = "http://app:8000/news/"
+    # Get the database session using get_db()
+    db: Session = next(get_db())  # We obtain the session manually here
+    try:
+        # Convert the dictionary into a Pydantic object
+        news_data = NewsItemCreate(**news_item)
+        # Call the function that inserts news items into the database directly
+        response = create_news_item(news_data, db)

-    response = requests.post(API_URL, json=news_item)
-    if response.status_code == 200:
-        logging.info(f"Noticia '{news_item['titulo']}' creada con éxito.")
-    else:
-        logging.info(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json())
+        logging.info(f"Noticia '{news_item['titulo']}' creada con éxito. ID: {response['id']}")
+    except Exception as e:
+        logging.error(f"Error al insertar '{news_item['titulo']}': {str(e)}")
+    finally:
+        db.close()  # Close the session after use

 def search_from_keywords_file():
     """
@@ -151,7 +161,7 @@ def search_from_keywords_file():
     all_news = []  # List to store all the collected news items

     try:
-        with open("keywords.txt", "r", encoding="utf-8") as file:
+        with open("/app/keywords.txt", "r", encoding="utf-8") as file:
             keywords = file.readlines()

             # Remove possible line breaks and extra spaces
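An optional refinement (not part of the commit): because get_db() is a generator dependency, insertar_datos has to close the session itself. A minimal sketch using contextlib.closing to guarantee the close even if validation or the insert raises, assuming get_db() yields a SQLAlchemy Session as above; the _alt name is purely illustrative.

from contextlib import closing

def insertar_datos_alt(news_item):
    # closing() calls db.close() on exit, even when an exception is raised
    with closing(next(get_db())) as db:
        try:
            news_data = NewsItemCreate(**news_item)
            response = create_news_item(news_data, db)
            logging.info(f"Noticia '{news_item['titulo']}' creada con éxito. ID: {response['id']}")
        except Exception as e:
            logging.error(f"Error al insertar '{news_item['titulo']}': {e}")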

View File

@@ -1 +0,0 @@
0 1 * * * python3 /app/main.py >> /var/log/cron.log 2>&1

View File

@@ -12,6 +12,8 @@ services:
       - MYSQL_PASSWORD=${MYSQL_PASSWORD}
       - MYSQL_DATABASE=${MYSQL_DATABASE}
       - MYSQL_PORT=${MYSQL_PORT}
+      - OLLAMA_URL=${OLLAMA_URL}
+      - OLLAMA_MODEL=${OLLAMA_MODEL}
     depends_on:
       mysql:
         condition: service_healthy
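For the substitution to work, the two new variables also have to exist in the project's .env file — an illustrative sketch with placeholder values (the real URL and model name depend on the local Ollama setup):

# .env (placeholder values, adjust to the local Ollama instance)
OLLAMA_URL=http://host.docker.internal:11434
OLLAMA_MODEL=llama3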
@@ -36,20 +38,6 @@
       timeout: 5s
       retries: 10

-  scrapper:
-    build:
-      context: ./scrapper
-      dockerfile: Dockerfile
-    container_name: scrapper
-    depends_on:
-      - app
-    environment:
-      - OLLAMA_URL=${OLLAMA_URL}
-      - OLLAMA_MODEL=${OLLAMA_MODEL}
-    volumes:
-      - ./scrapper:/app/scrapper
-      - ./crontab.txt:/etc/crontabs/root
-
 # metabase:
 #   image: metabase/metabase:latest
 #   container_name: metabase

View File

@@ -1,12 +0,0 @@
CREATE DATABASE IF NOT EXISTS scraper_db;
USE scraper_db;
CREATE TABLE IF NOT EXISTS noticias (
id INT AUTO_INCREMENT PRIMARY KEY,
titulo VARCHAR(255) NOT NULL,
contenido TEXT,
autor VARCHAR(255),
fuente VARCHAR(255),
fecha DATETIME,
link TEXT
);

View File

@@ -1,20 +0,0 @@
FROM python:latest

RUN apt-get update && apt-get install -y cron
WORKDIR ./

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Copy the crontab file into the container
COPY crontab.txt /etc/cron.d/my_cron_job

# Set the appropriate permissions
RUN chmod 0644 /etc/cron.d/my_cron_job

# Make sure the file is picked up by cron
RUN touch /var/log/cron.log && chmod 0666 /var/log/cron.log

CMD ["sh", "-c", "cron -f"]

View File

@@ -1 +0,0 @@

View File

@@ -1 +0,0 @@
0 1 * * * python3 /app/scrapper/webscrapper.py >> /var/log/cron.log 2>&1 # modify to adjust

View File

@@ -1 +0,0 @@
Defensa

View File

@@ -1,11 +0,0 @@
fastapi
uvicorn
requests
beautifulsoup4
googlenewsdecoder
pytz
logging
sqlalchemy
pydantic
python-dotenv
lxml