Commit 6df93a63 authored by marioromera

Enable saving and increase document limit

parent e233e7f9
.idea
downloaded_files
recover_files
results.csv
\ No newline at end of file
import glob
import os
import spacy
from spacy.lang.es import Spanish
from spacy.tokens import DocBin
import textacy
from textacy import preprocessing
import textacy.vsm
import csv
doc_bin = DocBin(store_user_data=True)
recover_files_path = "./recover_files"
if not os.path.exists(recover_files_path):
    os.makedirs(recover_files_path)
docs_path = f'{recover_files_path}/docs'
if not os.path.exists(docs_path):
    os.makedirs(docs_path)
# Deserialization: rebuild the vocab and any docs saved on a previous run
nlp_vocab = spacy.vocab.Vocab().from_disk(f'{recover_files_path}/vocab')
for (dirpath, dirnames, filenames) in os.walk(docs_path):
    for idx, filename in enumerate(filenames):
        doc_bin.add(spacy.tokens.Doc(nlp_vocab).from_disk(f"{docs_path}/{filename}"))
# Input words to search for
searched_words = ["google", "amazon", "netflix", "facebook", "microsoft", "apple", "vodafone", "alibaba",
                  "tesla", "5g", "twitter", "telefónica", "indra",
                  "airbnb", "tinder", "expedia", "bloomberg", "paypal", "baidu", "smart city"]
# Folder from which files will be loaded
# pdfs_folder_path = "./downloaded_files/texts"
pdfs_folder_path = "./downloaded_files/texts_to_debug"
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
# spaCy pipeline with the Spanish language loaded
# nlp = Spanish()
nlp = Spanish(disable=("ner", "tagger", "parser"))
# Raise the limit: got an error with a text of 4448803 chars, so allow that size plus a margin
nlp.max_length = 4448803 + 5000
# Serialization
def save_game():
    # Serialize vocab
    nlp.vocab.to_disk(f'{recover_files_path}/vocab')
    for i, doc_to_save in enumerate(doc_bin.get_docs(nlp.vocab)):
        print("Saving ", doc_to_save._.preview)
        doc_to_save.to_disk(f'{recover_files_path}/docs/{i}.txt')
    print("Saved")
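# Illustrative alternative (an assumption, not part of this commit): the whole DocBin can
# also be serialized into a single file with DocBin.to_bytes()/from_bytes(), which avoids
# writing one file per doc. A minimal sketch of what that could look like:
def save_doc_bin_bytes(path=f"{recover_files_path}/docs.bin"):
    # Write the serialized DocBin (docs plus user data) as raw bytes
    with open(path, "wb") as f:
        f.write(doc_bin.to_bytes())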
def clean_text(txt):
    # Strip accents, URLs, punctuation and extra whitespace before tokenizing
    clean = preprocessing.remove_accents(
        preprocessing.replace_urls(preprocessing.normalize_whitespace(preprocessing.remove_punctuation(txt))))
    # Keep tokens that are not stopwords and are longer than 2 characters
    clean = [tok.text for tok in nlp.tokenizer(clean) if not tok.is_stop and len(tok.text) > 2]
    return " ".join(clean)
corpus = []
docs_length = len(list(doc_bin.get_docs(nlp.vocab)))
print(f"Docs added: {docs_length}")
# Iterate over files in folder
for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
    for idx, filename in enumerate(filenames):
        # Skip files whose docs were already recovered from disk
        if idx < docs_length:
            continue
        print(f"Opening {filename}")
        file = open(f"{pdfs_folder_path}/{filename}", "r", encoding="utf-8")
        text = file.read()
@@ -43,14 +82,16 @@ for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
        date = filename[:len("YYYYMMDD")]
        # Create a spacy doc out of each text
        # TODO: Find out if it's possible to just use nlp and then add it
        # doc = textacy.make_spacy_doc((preprocessed, {"date": date}), "es")
        doc = nlp(text)
        # Show an overview of the document
        print(filename, doc._.preview)
        print("Adding doc to corpus")
        corpus.append(doc)
        doc_bin.add(doc)
        # Save progress once past the halfway point and on the last file
        if idx > len(filenames) / 2 or idx == len(filenames) - 1:
            print("Saving", idx)
            save_game()
print(f"All docs added: {len(corpus)}")
# Split documents into tokens (units of text, commonly known as words)
@@ -62,7 +103,7 @@ def get_tokenize_docs(documents):
# Create a document-term matrix of all the terms in the documents: first create the
# vectorizer, then fit it on the tokenized docs
vectorizer = textacy.vsm.Vectorizer(apply_idf=False, norm=None)
# doc_term_matrix = vectorizer.fit_transform(get_tokenize_docs(corpus))
doc_term_matrix = vectorizer.fit_transform(get_tokenize_docs(doc_bin.get_docs(nlp.vocab)))
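# Sanity check (illustrative addition, not part of this commit): fit_transform returns a
# sparse matrix with one row per document and one column per term.
print("Doc-term matrix shape:", doc_term_matrix.shape)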
with open('results.csv', 'w', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
......