Commit 294e520b authored by marioromera's avatar marioromera
Browse files

Add documentation

parent cd0ead6c
Pipeline #704 canceled with stages
.idea
downloaded_files
\ No newline at end of file
downloaded_files
results.csv
\ No newline at end of file
Aunque el código funciona, todavía quedarían muchas técnicas por explorar y optimizar; aun así, los resultados no son tan enriquecedores, por las siguientes razones:
- Requiere bajarse un mogollón de archivos
- Hace falta bastante energía para procesar tanto
\ No newline at end of file
-Hace falta bastante energia para procesar tanto
Para hacerlo funcionar:
1. instalar con `pip install -r requirements.txt`
2. ejecutar `python get_files_boe.py` (dentro de ese archivo podéis cambiar el rango de fechas)
3. ejecutar `python text_analyzer.py` (ahí están hard-coded las palabras clave que se buscarán; cambiadlas si os parece)
Hay muchísimos BOEs por día, así que os sugiero que probéis con un rango pequeño y/o canceléis el get_files_boe.py (Ctrl + C)
y luego ejecutéis el analizador
\ No newline at end of file
......@@ -5,6 +5,10 @@ from os import walk
from datetime import timedelta, date
import io
# Date range for which BOE publications will be downloaded.
start_date = date(2020, 5, 25)  # first publication date to fetch
end_date = date(2020, 5, 28)    # end of the range — presumably exclusive, mirroring daterange(); TODO confirm
# Returns an array containing all dates in between start and end
def daterange(start, end):
    """Yield every date from *start* (inclusive) up to *end* (exclusive).

    Mirrors ``range()`` semantics so the caller controls inclusivity by
    choosing ``end``.

    Args:
        start: first ``datetime.date`` to yield.
        end: ``datetime.date`` at which iteration stops (not yielded).

    Yields:
        Consecutive ``datetime.date`` objects, one per day.
    """
    # The original body was truncated to a bare `yield` in this view;
    # restore the standard day-by-day loop around it.
    for n in range(int((end - start).days)):
        yield start + timedelta(n)
# Date range to scan (see daterange() above for iteration semantics).
start_date = date(2020, 5, 25)
end_date = date(2020, 5, 28)
# Announce the range so long downloads are attributable to their dates.
print(f"Searching from: {start_date}, to: {end_date}")
# Fixed urls to download content
......
###### Requirements without Version Specifiers ######
beautifulsoup4~=4.9.0
requests~=2.23.0
bs4~=4.9.0
spacy~=2.2.4
textacy~=0.10.0
\ No newline at end of file
......@@ -4,6 +4,13 @@ from spacy.lang.es import Spanish
from textacy import preprocessing
import textacy.vsm
import csv
# Keywords (mostly tech-company names) whose tf/idf scores are computed
# over the downloaded BOE corpus; each word is lower-cased before lookup.
# NOTE(review): clean_text() removes accents from the corpus, so the
# accented query "telefónica" may never match — consider "telefonica".
searched_words = ["google", "amazon", "netflix", "facebook", "microsoft", "apple", "vodafone", "alibaba",
                  "tesla",
                  "twitter", "telefónica", "indra",
                  "airbnb", "tinder", "expedia", "bloomberg", "paypal", "baidu", "smart city"]
# Folder from where the extracted PDF text files will be loaded.
pdfs_folder_path = "./downloaded_files/texts"
......@@ -15,7 +22,8 @@ nlp = Spanish()
def clean_text(txt):
    """Normalize raw document text for term-frequency analysis.

    Removes punctuation, collapses whitespace, replaces URLs, strips
    accents, then tokenizes and keeps only meaningful tokens.

    Args:
        txt: raw text extracted from a PDF.

    Returns:
        A single space-joined string of the surviving tokens.
    """
    # The duplicated first assignment (diff residue, immediately
    # overwritten) has been removed.
    clean = preprocessing.remove_accents(
        preprocessing.replace_urls(preprocessing.normalize_whitespace(preprocessing.remove_punctuation(txt))))
    # Bug fix: the original condition `tok.is_stop or len(tok.text) > 2`
    # KEPT stop words and dropped short content tokens. For tf-idf we
    # want the opposite: drop stop words and tokens of <= 2 characters.
    tokens = [tok.text for tok in nlp.tokenizer(clean) if not tok.is_stop and len(tok.text) > 2]
    return " ".join(tokens)
......@@ -52,27 +60,35 @@ def get_tokenize_docs(documents):
for d in documents]
# Input word to search for; in the future this can be a list.
searched_word = "google".lower()
print(f"Finding {searched_word} needle in the haystack")
# Build a document-term matrix over the whole corpus: raw counts, no idf
# weighting and no normalization (tf/idf are weighted manually below).
vectorizer = textacy.vsm.Vectorizer(apply_idf=False, norm=None)
doc_term_matrix = vectorizer.fit_transform(get_tokenize_docs(corpus))
# Sentinel -1 means the word is absent from the corpus vocabulary.
searched_word_index = -1
try:
    searched_word_index = vectorizer.terms_list.index(searched_word)
except ValueError:
    # list.index raises exactly ValueError when the term is missing;
    # the original bare `except:` would also have swallowed
    # KeyboardInterrupt and real bugs.
    print("Word not found")
# Word is in the corpus: report its tf, idf and tf-idf.
if searched_word_index != -1:
    print("vocab", searched_word_index)
    tf = textacy.vsm.matrix_utils.get_term_freqs(doc_term_matrix, type_="log")[searched_word_index]
    print("tf", tf)
    idf = textacy.vsm.matrix_utils.get_inverse_doc_freqs(doc_term_matrix)[searched_word_index]
    print("idf", idf)
    print("tf-idf", tf * idf)
# Write one row of tf / idf / tf-idf scores per searched word.
# Bug fix: newline='' is required by the csv module so it controls line
# endings itself — without it, blank rows appear between records on Windows.
with open('results.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["word", "tf", "idf", "tfidf"])
    for searched_word in searched_words:
        searched_word = searched_word.lower()
        print(f"Finding {searched_word} needle in the haystack")
        # Sentinel -1 means the word is absent from the corpus vocabulary.
        searched_word_index = -1
        try:
            searched_word_index = vectorizer.terms_list.index(searched_word)
        except ValueError:
            # list.index raises exactly ValueError when the term is
            # missing; catching only that keeps Ctrl+C and real bugs
            # visible (the original used a bare `except:`).
            print("Word not found")
        # Word is in the corpus: compute and record its scores.
        if searched_word_index != -1:
            print("Number of words in corpus", len(vectorizer.terms_list))
            tf = textacy.vsm.matrix_utils.get_term_freqs(doc_term_matrix)[searched_word_index]
            print("tf", tf)
            idf = textacy.vsm.matrix_utils.get_inverse_doc_freqs(doc_term_matrix)[searched_word_index]
            print("idf", idf)
            tfidf = tf * idf
            print("tf-idf", tfidf)
            writer.writerow([searched_word, tf, idf, tfidf])
        else:
            # Still emit a row so the CSV covers every queried word.
            writer.writerow([searched_word, "", "", ""])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment