Commit 0e32faa0 authored by marioromera's avatar marioromera
Browse files

Save each 1/3 percentage

parent 6df93a63
......@@ -8,6 +8,15 @@ from textacy import preprocessing
import textacy.vsm
import csv
# Input words to search for
searched_words = ["google", "amazon", "netflix", "facebook", "microsoft", "apple", "vodafone", "alibaba",
"tesla", "5g",
"twitter", "telefónica", "indra",
"airbnb", "tinder", "expedia", "bloomberg", "paypal", "baidu", "smart city"]
# Folder from where files will be loaded
pdfs_folder_path = "./downloaded_files/texts"
doc_bin = DocBin(store_user_data=True)
recover_files_path = "./recover_files"
if not os.path.exists(recover_files_path):
......@@ -24,15 +33,6 @@ for (dirpath, dirnames, filenames) in os.walk(docs_path):
for idx, filename in enumerate(filenames):
doc_bin.add(spacy.tokens.Doc(nlp_vocab).from_disk(f"{docs_path}/{filename}"))
# Input words to search for
searched_words = ["google", "amazon", "netflix", "facebook", "microsoft", "apple", "vodafone", "alibaba",
"tesla", "5g",
"twitter", "telefónica", "indra",
"airbnb", "tinder", "expedia", "bloomberg", "paypal", "baidu", "smart city"]
# Folder from where files will be loaded
pdfs_folder_path = "./downloaded_files/texts_to_debug"
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
# Spacy library with spanish language loaded
......@@ -62,12 +62,13 @@ def clean_text(txt):
docs_length = len(list(doc_bin.get_docs(nlp.vocab)))
print(f"Docs added: {docs_length}")
print(f"Docs recovered: {docs_length}")
# Iterate over files in folder
for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
for idx, filename in enumerate(filenames):
# TODO: Find a better way to know if it's recovered
if idx < docs_length:
continue
......@@ -87,11 +88,16 @@ for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
print(filename, doc._.preview)
doc_bin.add(doc)
if idx > len(filenames) / 2 or idx == len(filenames) - 1:
# Save after every eighth of the files (12.5%) or at the end
if (idx % int(len(filenames) / 8)) == 0 or idx == len(filenames) - 1:
print("Saving", idx)
# TODO: Optimize to save from index
# from_idx = abs(len(filenames) - len(filenames) - idx)
save_game()
print(f"Adding doc to corpus")
print(f"Adding doc to corpus")
# Split documents into tokens(units of text, commonly known as words)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment