Commit 7a45fdca authored by marioromera's avatar marioromera
Browse files

add license and correct start without recovery

parent 0e32faa0
#
# The MIT License (MIT)
#
# Copyright (c) 2020 Mario Romera
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE
#
Aunque el código funciona, todavía quedarían muchas técnicas por explorar y optimizar. Aun así, los resultados no son tan enriquecedores, por las siguientes razones:
- Requiere bajarse mogollón de archivos
- Hace falta bastante energía para procesar tanto
......
#
# The MIT License (MIT)
#
# Copyright (c) 2020 Mario Romera
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE
#
import requests
from bs4 import BeautifulSoup
import os
......@@ -60,7 +84,7 @@ for single_date in daterange(start_date, end_date):
# Only create a summary file when there is content to store.
if len(summary_content):
    print(f"Creating summary file {summary_filepath}")
    # "w+" creates the file if it does not exist and truncates it otherwise.
    # The encoding has to be passed by keyword: the third positional
    # parameter of open() is `buffering`, so open(path, "w+", "utf-8")
    # raises TypeError instead of selecting UTF-8.
    with open(summary_filepath, "w+", encoding="utf-8") as f:
        f.write(summary_content)
......@@ -69,7 +93,7 @@ for (dirpath, dirnames, filenames) in walk(summaries_folder):
filepath = summaries_folder + "/" + filename
print("Opening " + filepath)
# The encoding has to be passed by keyword: the third positional parameter
# of open() is `buffering`, so open(path, "r", "utf-8") raises TypeError.
# NOTE(review): the handle is not closed in the visible code - confirm it is
# closed further down, or wrap the parsing in a `with` block.
summary = open(filepath, "r", encoding="utf-8")
# Parse doc to allow queries
parsed_summary = BeautifulSoup(summary, 'xml')
......
#
# The MIT License (MIT)
#
# Copyright (c) 2020 Mario Romera
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE
#
import glob
import os
import spacy
......@@ -9,10 +33,32 @@ import textacy.vsm
import csv
# Terms to search for (all lowercase; entries may be single words or
# multi-word phrases). Defined once: an earlier, shorter version of this
# list was assigned first and immediately overwritten.
searched_words = [
    "google",
    "amazon",
    "netflix",
    "facebook",
    "microsoft",
    "apple",
    "vodafone",
    "alibaba",
    "tesla",
    "twitter",
    "telefónica",
    "indra",
    "airbnb",
    "tinder",
    "expedia",
    "bloomberg",
    "paypal",
    "baidu",
    "huawei",
    "oracle",
    "amazon web service",
    "ibm",
    "5g",
    "ciberseguridad",
    "smart city",
]
# Folder from where files will be loaded
pdfs_folder_path = "./downloaded_files/texts"
......@@ -21,6 +67,7 @@ doc_bin = DocBin(store_user_data=True)
recover_files_path = "./recover_files"
if not os.path.exists(recover_files_path):
os.makedirs(recover_files_path)
spacy.load("es").vocab.to_disk(f'{recover_files_path}/vocab')
docs_path = f'{recover_files_path}/docs'
if not os.path.exists(docs_path):
......@@ -82,7 +129,6 @@ for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
print(f"Making spacy doc")
date = filename[:len("YYYYMMDD")]
# Creates a spacy doc out of each text
# TODO: Find out if its possible to just use nlp and then add it
doc = nlp(text)
# Shows an overview of document
print(filename, doc._.preview)
......@@ -139,3 +185,4 @@ with open('results.csv', 'w', encoding='utf-8') as csvfile:
writer.writerow([searched_word, tf, idf, tfidf])
else:
writer.writerow([searched_word, "", "", ""])
print("Analysis finished")
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment