testing

Dockerfile (deleted, 18 lines)
@@ -1,18 +0,0 @@
-# Use an official Python image
-FROM python:3.9
-
-# Set the working directory inside the container
-WORKDIR /app
-
-# Copy the application files into the container
-COPY app/ /app/
-
-# Install dependencies if needed (adjust to your requirements)
-RUN pip install mysql-connector-python schedule
-
-# Copy the crontab file and register it
-COPY crontab.txt /etc/cron.d/crontab
-RUN chmod 0644 /etc/cron.d/crontab && crontab /etc/cron.d/crontab
-
-# Start cron and run the script in the background
-CMD cron && tail -f /var/log/cron.log

app/autorsearcher.py
@@ -1,6 +1,5 @@
 import json
 import requests
 import sys
 from bs4 import BeautifulSoup
 import logging
 
@@ -86,17 +85,14 @@ def get_author_from_url(url):
 
     author = get_author_from_json_ld(soup)
     if author:
         logging.info(author)
         return author
 
     author = get_author_from_meta(soup)
     if author:
         logging.info(author)
         return author
 
-    return "Autor no encontrado en los metadatos."
+    logging.info("No encontrado autor")
+    return "Desconocido"
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         url = sys.argv[1]
         print(get_author_from_url(url))
     else:
         print("Uso: python autorsearcher.py <URL>")
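
Side note: get_author_from_json_ld and get_author_from_meta are called above but sit outside this hunk. For orientation, here is a minimal sketch of what the JSON-LD lookup presumably does; the function name comes from the diff, but the body below is an assumption, not the committed code:

import json

def get_author_from_json_ld(soup):
    # Scan <script type="application/ld+json"> blocks for an author field.
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string or "")
        except ValueError:
            continue
        # JSON-LD can hold a single object or a list of objects.
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict):
                continue
            author = item.get("author")
            if isinstance(author, list) and author:
                author = author[0]
            if isinstance(author, dict):
                author = author.get("name")
            if author:
                return author
    return None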

app/keywords.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
+Defensa
+Fuerzas Armadas
+CNI
+Guardia Civil
+Inteligencia
+Policia
+Ejercito

app/main.py (12 lines changed)
@@ -1,12 +1,24 @@
 from fastapi import FastAPI
 from .database import Base, engine
 from .routes import router
+from apscheduler.schedulers.background import BackgroundScheduler
+from .webscrapper import ejecutar_scrapper
 
 # Create the MySQL tables if they do not exist
 Base.metadata.create_all(bind=engine)
 
 # Initialize FastAPI
 app = FastAPI()
+scheduler = BackgroundScheduler()
 
 # Register the routes
 app.include_router(router)
+
+@app.on_event("startup")
+def startup_event():
+    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
+    scheduler.start()
+
+@app.on_event("shutdown")
+def shutdown_event():
+    scheduler.shutdown()
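
Worth noting: newer FastAPI releases deprecate @app.on_event in favor of a lifespan handler. If the pinned FastAPI version warns about this, an equivalent wiring would look roughly like the sketch below; same scheduler and job, only the hook style changes:

from contextlib import asynccontextmanager

from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import FastAPI

from .webscrapper import ejecutar_scrapper

scheduler = BackgroundScheduler()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Run the scraper once a day in a background thread.
    scheduler.add_job(ejecutar_scrapper, "interval", hours=24)
    scheduler.start()
    yield
    # Stop the scheduler cleanly when the app shuts down.
    scheduler.shutdown()

app = FastAPI(lifespan=lifespan)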

requirements.txt
@@ -11,3 +11,5 @@ python-dotenv
 mysql-connector-python
 pymysql
 cryptography
+lxml
+apscheduler

app/routes.py (194 lines changed)
@@ -1,5 +1,6 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy.orm import Session
+from sqlalchemy.sql import func
 from .database import get_db
 from .models import NewsItem
 from pydantic import BaseModel
@@ -54,3 +55,196 @@ def create_news_item(item: NewsItemCreate, db: Session = Depends(get_db)):
 
     return {"message": "Noticia creada con éxito", "id": new_item.id, "titulo": new_item.titulo}
 
+@router.get("/news/count/by-source/date-range")
+def count_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"count_by_source_in_range": results}
+
+
+@router.get("/news/count/by-author/date-range")
+def count_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.fecha >= fecha_inicio, NewsItem.fecha <= fecha_fin)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"count_by_author_in_range": results}
+
+
+@router.get("/news/count/favorable/by-author/date-range")
+def count_favorable_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"favorable_count_by_author_in_range": results}
+
+
+@router.get("/news/count/unfavorable/by-author/date-range")
+def count_unfavorable_news_by_author_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"unfavorable_count_by_author_in_range": results}
+
+@router.get("/news/count/favorable/by-source/date-range")
+def count_favorable_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"favorable_count_by_source_in_range": results}
+
+@router.get("/news/count/unfavorable/by-source/date-range")
+def count_unfavorable_news_by_source_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"unfavorable_count_by_source_in_range": results}
+
+@router.get("/news/neutral/date-range")
+def get_neutral_news_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem)
+        .filter(
+            NewsItem.favorable == False,
+            NewsItem.critico == False,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .all()
+    )
+    return results
+
+
+@router.get("/news/mixed/date-range")
+def get_mixed_news_in_range(
+    fecha_inicio: datetime, fecha_fin: datetime, db: Session = Depends(get_db)
+):
+    results = (
+        db.query(NewsItem)
+        .filter(
+            NewsItem.favorable == True,
+            NewsItem.critico == True,
+            NewsItem.fecha >= fecha_inicio,
+            NewsItem.fecha <= fecha_fin,
+        )
+        .all()
+    )
+    return results
+
+@router.get("/news/count/by-source")
+def count_news_by_source(db: Session = Depends(get_db)):
+    results = db.query(NewsItem.fuente, func.count(NewsItem.id)).group_by(NewsItem.fuente).all()
+    return {"count_by_source": results}
+
+@router.get("/news/count/by-author")
+def count_news_by_author(db: Session = Depends(get_db)):
+    results = db.query(NewsItem.autor, func.count(NewsItem.id)).group_by(NewsItem.autor).all()
+    return {"count_by_author": results}
+
+@router.get("/news/count/favorable/by-author")
+def count_favorable_news_by_author(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.favorable == True)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"favorable_count_by_author": results}
+
+@router.get("/news/count/unfavorable/by-author")
+def count_unfavorable_news_by_author(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.autor, func.count(NewsItem.id))
+        .filter(NewsItem.critico == True)
+        .group_by(NewsItem.autor)
+        .all()
+    )
+    return {"unfavorable_count_by_author": results}
+
+@router.get("/news/count/favorable/by-source")
+def count_favorable_news_by_source(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.favorable == True)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"favorable_count_by_source": results}
+
+@router.get("/news/count/unfavorable/by-source")
+def count_unfavorable_news_by_source(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem.fuente, func.count(NewsItem.id))
+        .filter(NewsItem.critico == True)
+        .group_by(NewsItem.fuente)
+        .all()
+    )
+    return {"unfavorable_count_by_source": results}
+
+@router.get("/news/neutral")
+def get_neutral_news(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem)
+        .filter(NewsItem.favorable == False, NewsItem.critico == False)
+        .all()
+    )
+    return results
+
+
+@router.get("/news/mixed")
+def get_mixed_news(db: Session = Depends(get_db)):
+    results = (
+        db.query(NewsItem)
+        .filter(NewsItem.favorable == True, NewsItem.critico == True)
+        .all()
+    )
+    return results
+
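
All the date-range endpoints above take fecha_inicio and fecha_fin as query parameters, which FastAPI parses into datetime objects from ISO-8601 strings. A quick client-side sketch; the host and port are assumptions based on the compose setup, and the response shape follows the return statements above:

import requests

# Hypothetical call against a local deployment; adjust host/port to yours.
params = {
    "fecha_inicio": "2025-01-01T00:00:00",
    "fecha_fin": "2025-01-31T23:59:59",
}
resp = requests.get(
    "http://localhost:8000/news/count/by-source/date-range",
    params=params,
)
resp.raise_for_status()
# Expected shape, roughly: {"count_by_source_in_range": [["<fuente>", <count>], ...]}
print(resp.json())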

app/webscrapper.py
@@ -1,12 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
 import time
-import subprocess
 from googlenewsdecoder import gnewsdecoder
-from iacorrector import is_security_related, is_critico, is_favorable  # Import the functions from iacorrector.py
+from .iacorrector import is_security_related, is_critico, is_favorable  # Import the functions from iacorrector.py
 from datetime import datetime
 import pytz
 import logging
+from .database import get_db
+from sqlalchemy.orm import Session
+from .routes import create_news_item, NewsItemCreate
+from .autorsearcher import get_author_from_url
 
 # Logging configuration
 LOG_FILE = "app.log"
@@ -25,9 +28,8 @@ def get_author_from_script(url):
     Calls autorsearcher.py with the news URL and returns the author found.
     """
     try:
-        result = subprocess.run(["python", "autorsearcher.py", url], capture_output=True, text=True)
-        author = result.stdout.strip()
-        return author if author else "Desconocido"
+        author = get_author_from_url(url)
+        return author
     except Exception as e:
         logging.info(f"Error al obtener el autor para {url}: {e}")
         return "Desconocido"
@@ -135,14 +137,22 @@ def search_news(query):
     return news_list
 
 def insertar_datos(news_item):
-    API_URL = "http://app:8000/news/"
+    # Get a database session using get_db()
+    db: Session = next(get_db())  # here we obtain the session manually
     try:
+        # Convert the dict into a Pydantic object
+        news_data = NewsItemCreate(**news_item)
+
-        response = requests.post(API_URL, json=news_item)
+        # Call the function that inserts news into the database directly
+        response = create_news_item(news_data, db)
 
-        if response.status_code == 200:
-            logging.info(f"Noticia '{news_item['titulo']}' creada con éxito.")
-        else:
-            logging.info(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json())
+        logging.info(f"Noticia '{news_item['titulo']}' creada con éxito. ID: {response['id']}")
 
     except Exception as e:
         logging.error(f"Error al insertar '{news_item['titulo']}': {str(e)}")
 
+    finally:
+        db.close()  # Close the session after use
+
 def search_from_keywords_file():
     """
@@ -151,7 +161,7 @@ def search_from_keywords_file():
     all_news = []  # List to collect every news item gathered
 
     try:
-        with open("keywords.txt", "r", encoding="utf-8") as file:
+        with open("/app/keywords.txt", "r", encoding="utf-8") as file:
             keywords = file.readlines()
 
         # Strip stray newlines and extra whitespace
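
insertar_datos now bypasses HTTP and calls create_news_item directly, so it drives the get_db generator by hand with next(get_db()) and closes the session itself in the finally block; since create_news_item returns a plain dict here, the old response.status_code check no longer applies. For context, a typical shape of such a dependency; this is an assumed sketch, as app/database.py is not part of this diff:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Assumed wiring; the real engine and session factory live in app/database.py.
engine = create_engine("sqlite://")  # stand-in engine for illustration
SessionLocal = sessionmaker(bind=engine, autoflush=False)

def get_db():
    db = SessionLocal()
    try:
        yield db  # FastAPI resolves this as a dependency; next(get_db()) works too
    finally:
        db.close()

# Manual use, as insertar_datos does:
db = next(get_db())
try:
    ...  # work with the session
finally:
    db.close()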

crontab.txt (deleted)
@@ -1 +0,0 @@
-0 1 * * * python3 /app/main.py >> /var/log/cron.log 2>&1

docker-compose.yml
@@ -12,6 +12,8 @@ services:
       - MYSQL_PASSWORD=${MYSQL_PASSWORD}
       - MYSQL_DATABASE=${MYSQL_DATABASE}
       - MYSQL_PORT=${MYSQL_PORT}
+      - OLLAMA_URL=${OLLAMA_URL}
+      - OLLAMA_MODEL=${OLLAMA_MODEL}
     depends_on:
       mysql:
         condition: service_healthy
@@ -36,20 +38,6 @@ services:
         timeout: 5s
         retries: 10
 
-  scrapper:
-    build:
-      context: ./scrapper
-      dockerfile: Dockerfile
-    container_name: scrapper
-    depends_on:
-      - app
-    environment:
-      - OLLAMA_URL=${OLLAMA_URL}
-      - OLLAMA_MODEL=${OLLAMA_MODEL}
-    volumes:
-      - ./scrapper:/app/scrapper
-      - ./crontab.txt:/etc/crontabs/root
-
   # metabase:
   #   image: metabase/metabase:latest
   #   container_name: metabase
 

init.sql (deleted, 12 lines)
@@ -1,12 +0,0 @@
-CREATE DATABASE IF NOT EXISTS scraper_db;
-USE scraper_db;
-
-CREATE TABLE IF NOT EXISTS noticias (
-    id INT AUTO_INCREMENT PRIMARY KEY,
-    titulo VARCHAR(255) NOT NULL,
-    contenido TEXT,
-    autor VARCHAR(255),
-    fuente VARCHAR(255),
-    fecha DATETIME,
-    link TEXT
-);
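
With init.sql gone, schema creation is handled by Base.metadata.create_all(bind=engine) in app/main.py. The NewsItem model in app/models.py is not shown in this diff, but it presumably mirrors the dropped noticias table plus the favorable/critico flags that routes.py filters on; a sketch under that assumption:

from sqlalchemy import Boolean, Column, DateTime, Integer, String, Text
from .database import Base

class NewsItem(Base):
    __tablename__ = "noticias"  # assumed to match the old init.sql table

    id = Column(Integer, primary_key=True, autoincrement=True)
    titulo = Column(String(255), nullable=False)
    contenido = Column(Text)
    autor = Column(String(255))
    fuente = Column(String(255))
    fecha = Column(DateTime)
    link = Column(Text)
    # Sentiment flags used by the new endpoints (assumed; not in the old SQL).
    favorable = Column(Boolean, default=False)
    critico = Column(Boolean, default=False)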

scrapper/Dockerfile (deleted, 20 lines)
@@ -1,20 +0,0 @@
-FROM python:latest
-RUN apt-get update && apt-get install -y cron
-
-WORKDIR ./
-
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-COPY . .
-# Copy the crontab file into the container
-COPY crontab.txt /etc/cron.d/my_cron_job
-
-# Set the appropriate permissions
-RUN chmod 0644 /etc/cron.d/my_cron_job
-
-# Make sure the file gets processed by cron
-RUN touch /var/log/cron.log && chmod 0666 /var/log/cron.log
-
-
-CMD ["sh", "-c", "cron -f"]

@@ -1 +0,0 @@
-

scrapper/crontab.txt (deleted)
@@ -1 +0,0 @@
-0 1 * * * python3 /app/scrapper/webscrapper.py >> /var/log/cron.log 2>&1 # modify to adjust

scrapper/keywords.txt (deleted)
@@ -1 +0,0 @@
-Defensa

scrapper/requirements.txt (deleted, 11 lines)
@@ -1,11 +0,0 @@
-fastapi
-uvicorn
-requests
-beautifulsoup4
-googlenewsdecoder
-pytz
-logging
-sqlalchemy
-pydantic
-python-dotenv
-lxml