docker ready, without tests

This commit is contained in:
Your Name
2025-02-06 17:19:30 +01:00
parent 03d334fade
commit 85228eeda9
8 changed files with 159 additions and 95 deletions

View File

@@ -1,18 +1,22 @@
# Use an official Python image
FROM python:3.9
FROM python:3.11-slim
# Set the working directory inside the container
WORKDIR /app
# Copy the application files into the container
COPY app/ /app/
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install dependencies if needed (adjust to your requirements)
RUN pip install mysql-connector-python schedule
COPY . .
# Copy the crontab file and set it up
COPY crontab.txt /etc/cron.d/crontab
RUN chmod 0644 /etc/cron.d/crontab && crontab /etc/cron.d/crontab
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
# Start cron and run the script in the background
CMD cron && tail -f /var/log/cron.log
# Dockerfile for the scrapper
FROM python:3.11-slim AS scrapper
WORKDIR /app/scrapper
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY scrapper/ ./scrapper
CMD ["sh", "-c", "crond -f"]

View File

@@ -4,10 +4,10 @@ from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Load variables from the .env file
# Load variables from the .env file (for local environments)
load_dotenv()
# MySQL configuration
# MySQL configuration from environment variables
MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost")
MYSQL_USER = os.getenv("MYSQL_USER", "root")
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD", "manabo")
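For context, a minimal sketch of how the rest of the connection setup typically looks with these variables and the mysql-connector-python driver from requirements.txt; the actual lines are outside this hunk, so the defaults below are assumptions:

MYSQL_DATABASE = os.getenv("MYSQL_DATABASE", "scraper_db")
MYSQL_PORT = os.getenv("MYSQL_PORT", "3306")

# Assumed connection string for SQLAlchemy with mysql-connector-python
DATABASE_URL = (
    f"mysql+mysqlconnector://{MYSQL_USER}:{MYSQL_PASSWORD}"
    f"@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}"
)

engine = create_engine(DATABASE_URL, pool_pre_ping=True)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

def get_db():
    # Open a session per request and always close it afterwards
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()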

View File

@@ -4,6 +4,15 @@ from .database import get_db
from .models import NewsItem
from pydantic import BaseModel
from datetime import datetime
import logging
# Logging configuration
LOG_FILE = "app.log"
logging.basicConfig(
filename=LOG_FILE, # Log file
level=logging.INFO, # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(levelname)s - %(message)s", # Log record format
)
router = APIRouter()
@@ -23,6 +32,7 @@ def create_news_item(item: NewsItemCreate, db: Session = Depends(get_db)):
# Check whether the title already exists
existing_item = db.query(NewsItem).filter(NewsItem.titulo == item.titulo).first()
if existing_item:
logging.info("Título ya en la base de datos")
raise HTTPException(status_code=400, detail="El título ya existe en la base de datos")
# Create the new object

View File

@@ -1,65 +1,63 @@
version: "3.9"
version: '3.8'
services:
app:
build: .
container_name: mi_scraper
container_name: fastapi_app
ports:
- "8000:8000"
environment:
- MYSQL_HOST=${MYSQL_HOST}
- MYSQL_USER=${MYSQL_USER}
- MYSQL_PASSWORD=${MYSQL_PASSWORD}
- MYSQL_DATABASE=${MYSQL_DATABASE}
- MYSQL_PORT=${MYSQL_PORT}
depends_on:
- mysql
environment:
- MYSQL_HOST=mysql
- MYSQL_USER=root
- MYSQL_PASSWORD=admin123
- MYSQL_DATABASE=scraper_db
volumes:
- ./app:/app
restart: always
networks:
- metanet1
- db
command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
mysql:
image: mysql:8
container_name: mi_mysql
db:
image: mysql:latest
container_name: mysql_db
restart: always
environment:
MYSQL_ROOT_PASSWORD: admin123
MYSQL_DATABASE: scraper_db
MYSQL_ROOT_PASSWORD: ${MYSQL_PASSWORD}
MYSQL_DATABASE: ${MYSQL_DATABASE}
MYSQL_USER: ${MYSQL_USER}
MYSQL_PASSWORD: ${MYSQL_PASSWORD}
ports:
- "3306:3306"
volumes:
- mysql_data:/var/lib/mysql
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
networks:
- metanet1
scrapper:
build: .
container_name: scrapper
depends_on:
- app
environment:
- OLLAMA_URL=${OLLAMA_URL}
- OLLAMA_MODEL=${OLLAMA_MODEL}
command: ["sh", "-c", "crond -f"]
volumes:
- ./scrapper:/app/scrapper
- ./crontab.txt:/etc/crontabs/root
metabase:
image: metabase/metabase:latest
container_name: metabase
hostname: metabase
volumes:
- /dev/urandom:/dev/random:ro
ports:
- "3000:3000"
- "3100:3000"
environment:
MB_DB_TYPE: mysql
MB_DB_DBNAME: scraper_db
MB_DB_PORT: 3306
MB_DB_USER: root
MB_DB_PASS: admin123
MB_DB_HOST: mysql
networks:
- metanet1
MB_DB_DBNAME: ${MYSQL_DATABASE}
MB_DB_HOST: ${MYSQL_HOST}
MB_DB_PORT: ${MYSQL_PORT}
MB_DB_USER: ${MYSQL_USER}
MB_DB_PASS: ${MYSQL_PASSWORD}
depends_on:
- mysql
healthcheck:
test: curl --fail -I http://localhost:3000/api/health || exit 1
interval: 15s
timeout: 5s
retries: 5
networks:
metanet1:
driver: bridge
- db
volumes:
mysql_data:
mysql_data:
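Both the new compose file and database.py now read their settings from environment variables. A hypothetical .env file covering every variable referenced above could look like this (all values are placeholders, assuming the MySQL service keeps the name db):

MYSQL_HOST=db
MYSQL_PORT=3306
MYSQL_USER=scraper
MYSQL_PASSWORD=change-me
MYSQL_DATABASE=scraper_db
OLLAMA_URL=http://localhost:11434/api/generate
OLLAMA_MODEL=llama3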

requirements.txt Normal file
View File

@@ -0,0 +1,12 @@
fastapi
uvicorn
requests
beautifulsoup4
googlenewsdecoder
iacorrector
pytz
# logging is part of the Python standard library and does not need to be installed
sqlalchemy
pydantic
python-dotenv
mysql-connector-python

View File

@@ -2,6 +2,15 @@ import json
import requests
import sys
from bs4 import BeautifulSoup
import logging
# Logging configuration
LOG_FILE = "app.log"
logging.basicConfig(
filename=LOG_FILE, # Log file
level=logging.INFO, # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(levelname)s - %(message)s", # Log record format
)
def download_html_as_human(url):
"""
@@ -70,7 +79,7 @@ def get_author_from_url(url):
"""
html_content = download_html_as_human(url)
if not html_content:
print("error")
logging.info("error, no se pudo descargar la pagina")
return "No se pudo descargar la página."
soup = BeautifulSoup(html_content, 'html.parser')
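The selectors that get_author_from_url actually applies to the parsed document are outside this hunk; purely as an illustration, one common approach is to read the standard author metadata tag:

# Illustrative only: look for a standard <meta name="author"> tag in the parsed HTML
author_meta = soup.find("meta", attrs={"name": "author"})
if author_meta and author_meta.get("content"):
    return author_meta["content"].strip()
return "Desconocido"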

View File

@@ -1,66 +1,91 @@
import requests
import json
import os
import logging
# Logging configuration
LOG_FILE = "app.log"
logging.basicConfig(
filename=LOG_FILE, # Log file
level=logging.INFO, # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(levelname)s - %(message)s", # Log record format
)
# Read configuration from environment variables
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/generate")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3")
def is_security_related(prompt):
url = "http://localhost:11434/api/generate"
logging.info(f"Checking if topic is security-related: {prompt}")
data = {
"model": "llama3",
"model": OLLAMA_MODEL,
"prompt": f"Does the following topic relate to national defense, armed forces, police, espionage, or intelligence? Answer only with 'true' or 'false'. Topic: {prompt}",
}
response = requests.post(url, json=data)
try:
# Split the response into lines and parse each one
response = requests.post(OLLAMA_URL, json=data)
response.raise_for_status() # Raises an exception if the request fails
for line in response.text.strip().split("\n"):
json_data = json.loads(line)
if "response" in json_data and json_data["response"].strip():
return json_data["response"].strip().lower() == "true"
result = json_data["response"].strip().lower() == "true"
logging.info(f"Result for '{prompt}': {result}")
return result
except requests.RequestException as e:
logging.error(f"Request error: {e}")
except json.JSONDecodeError as e:
print("JSON Decode Error:", e)
logging.error(f"JSON Decode Error: {e}")
return False
def is_critico(prompt):
url = "http://localhost:11434/api/generate"
logging.info(f"Checking if topic is critical of security forces: {prompt}")
data = {
"model": "llama3",
"prompt": f"Does the following text critics the armed forces, security forces as Guardia Civil or Police, intelligence agencies such as CNI? Answer only with 'true' or 'false'. Topic: {prompt}",
"model": OLLAMA_MODEL,
"prompt": f"Does the following text criticizes the armed forces, security forces as Guardia Civil or Police, intelligence agencies such as CNI? Answer only with 'true' or 'false'. Topic: {prompt}",
}
response = requests.post(url, json=data)
try:
# Split the response into lines and parse each one
response = requests.post(OLLAMA_URL, json=data)
response.raise_for_status()
for line in response.text.strip().split("\n"):
json_data = json.loads(line)
if "response" in json_data and json_data["response"].strip():
return json_data["response"].strip().lower() == "true"
result = json_data["response"].strip().lower() == "true"
logging.info(f"Result for '{prompt}': {result}")
return result
except requests.RequestException as e:
logging.error(f"Request error: {e}")
except json.JSONDecodeError as e:
print("JSON Decode Error:", e)
logging.error(f"JSON Decode Error: {e}")
return False
def is_favorable(prompt):
url = "http://localhost:11434/api/generate"
logging.info(f"Checking if topic is favorable to security forces: {prompt}")
data = {
"model": "llama3",
"prompt": f"Does the following text favours the armed forces, security forces as Guardia Civil or Police, intelligence agencies such as CNI? Answer only with 'true' or 'false'. Topic: {prompt}",
"model": OLLAMA_MODEL,
"prompt": f"Does the following text favor the armed forces, security forces as Guardia Civil or Police, intelligence agencies such as CNI? Answer only with 'true' or 'false'. Topic: {prompt}",
}
response = requests.post(url, json=data)
try:
# Split the response into lines and parse each one
response = requests.post(OLLAMA_URL, json=data)
response.raise_for_status()
for line in response.text.strip().split("\n"):
json_data = json.loads(line)
if "response" in json_data and json_data["response"].strip():
return json_data["response"].strip().lower() == "true"
result = json_data["response"].strip().lower() == "true"
logging.info(f"Result for '{prompt}': {result}")
return result
except requests.RequestException as e:
logging.error(f"Request error: {e}")
except json.JSONDecodeError as e:
print("JSON Decode Error:", e)
logging.error(f"JSON Decode Error: {e}")
return False
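A quick usage sketch of these three helpers, assuming an Ollama instance is reachable at OLLAMA_URL; the headline is invented for illustration:

if __name__ == "__main__":
    headline = "El CNI investiga un presunto caso de espionaje industrial"
    print(is_security_related(headline))  # True when the topic touches defense, police or intelligence
    print(is_critico(headline))           # True when the text criticizes security forces
    print(is_favorable(headline))         # True when the text favors security forces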

View File

@@ -1,14 +1,20 @@
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
from googlenewsdecoder import gnewsdecoder
from iacorrector import is_security_related, is_critico, is_favorable # Import the helper functions from iacorrector.py
from datetime import datetime
import pytz
import logging
# Logging configuration
LOG_FILE = "app.log"
logging.basicConfig(
filename=LOG_FILE, # Log file
level=logging.INFO, # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(levelname)s - %(message)s", # Log record format
)
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@@ -23,7 +29,7 @@ def get_author_from_script(url):
author = result.stdout.strip()
return author if author else "Desconocido"
except Exception as e:
print(f"Error al obtener el autor para {url}: {e}")
logging.info(f"Error al obtener el autor para {url}: {e}")
return "Desconocido"
def get_url_from_google_news(url):
@@ -36,7 +42,7 @@ def get_url_from_google_news(url):
else:
return "N/C"
except Exception as e:
print(f"Error occurred: {e}")
logging.info(f"Error occurred: {e}")
def get_article_content(url):
"""
@@ -45,7 +51,7 @@ def get_article_content(url):
try:
response = requests.get(url, headers=HEADERS)
if response.status_code != 200:
print(f"Error al acceder a {url}: Código {response.status_code}")
logging.info(f"Error al acceder a {url}: Código {response.status_code}")
return "No se pudo obtener el contenido"
soup = BeautifulSoup(response.text, "html.parser")
@@ -68,7 +74,7 @@ def get_article_content(url):
return "No se encontró contenido relevante"
except Exception as e:
print(f"Error al extraer contenido de {url}: {e}")
logging.info(f"Error al extraer contenido de {url}: {e}")
return "Error al extraer contenido"
def search_news(query):
@@ -79,7 +85,7 @@ def search_news(query):
response = requests.get(base_url, headers=HEADERS)
if response.status_code != 200:
print(f"Error al acceder a la página para la consulta '{query}': {response.status_code}")
logging.info(f"Error al acceder a la página para la consulta '{query}': {response.status_code}")
return []
soup = BeautifulSoup(response.content, 'xml')
@@ -123,19 +129,19 @@
insertar_datos(news_item)
except Exception as e:
print(f"Error al procesar un artículo para '{query}': {e}")
logging.info(f"Error al procesar un artículo para '{query}': {e}")
return news_list
def insertar_datos(news_item):
API_URL = "http://127.0.0.1:8001/news/"
API_URL = "http://localhost:8000/news/"
response = requests.post(API_URL, json=news_item)
if response.status_code == 200:
print(f"Noticia '{news_item['titulo']}' creada con éxito.")
logging.info(f"Noticia '{news_item['titulo']}' creada con éxito.")
else:
print(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json())
logging.info(f"Error al insertar '{news_item['titulo']}':", response.status_code, response.json())
def search_from_keywords_file():
"""
@@ -151,14 +157,14 @@ def search_from_keywords_file():
keywords = [keyword.strip() for keyword in keywords]
for keyword in keywords:
print(f"\nBuscando noticias sobre: {keyword}")
logging.info(f"\nBuscando noticias sobre: {keyword}")
search_news(keyword)
time.sleep(2) # Pause to avoid being blocked for making too many requests in a short time
except FileNotFoundError:
print("No se encontró el archivo 'keywords.txt'.")
logging.info("No se encontró el archivo 'keywords.txt'.")
except Exception as e:
print(f"Error al leer el archivo 'keywords.txt': {e}")
logging.info(f"Error al leer el archivo 'keywords.txt': {e}")
# Run the search from the keywords file
search_from_keywords_file()
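search_from_keywords_file() reads its queries from keywords.txt; a hypothetical example of that file, assuming one keyword per line (the real list is not part of this commit):

Guardia Civil
Policía Nacional
CNI
Fuerzas Armadas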