marioromera / digital_capitalism · Commits

Commit 6df93a63
authored May 27, 2020 by marioromera
Enable saving and increase document limit
parent e233e7f9

Changes 2
.gitignore (View file @ 6df93a63)

.idea
downloaded_files
recover_files
results.csv
\ No newline at end of file
text_analyzer.py (View file @ 6df93a63)
import glob
import os
import spacy
from spacy.lang.es import Spanish
from spacy.tokens import DocBin
import textacy
from textacy import preprocessing
import textacy.vsm
import csv

doc_bin = DocBin(store_user_data=True)

recover_files_path = "./recover_files"
if not os.path.exists(recover_files_path):
    os.makedirs(recover_files_path)

docs_path = f'{recover_files_path}/docs'
if not os.path.exists(docs_path):
    os.makedirs(docs_path)
# Deserialization
nlp_vocab = spacy.vocab.Vocab().from_disk(f'{recover_files_path}/vocab')
for (dirpath, dirnames, filenames) in os.walk(docs_path):
    for idx, filename in enumerate(filenames):
        doc_bin.add(spacy.tokens.Doc(nlp_vocab).from_disk(f"{docs_path}/{filename}"))
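# Editor's illustrative aside, not part of this commit: the same DocBin can also be
# round-tripped in memory with to_bytes()/from_bytes(), avoiding one file per Doc.
# This assumes the spaCy 2.x DocBin API already used above.
docbin_bytes = doc_bin.to_bytes()
restored = DocBin(store_user_data=True).from_bytes(docbin_bytes)
assert len(list(restored.get_docs(nlp_vocab))) == len(doc_bin)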
# Words to search for
searched_words = ["google", "amazon", "netflix", "facebook", "microsoft", "apple",
                  "vodafone", "alibaba", "tesla", "5g", "twitter", "telefónica",
                  "indra", "airbnb", "tinder", "expedia", "bloomberg", "paypal",
                  "baidu", "smart city"]
# Folder from where files will be loaded
pdfs_folder_path = "./downloaded_files/texts"
pdfs_folder_path = "./downloaded_files/texts_to_debug"
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS

# spaCy pipeline with the Spanish language loaded
nlp = Spanish()
nlp = Spanish(disable=("ner", "tagger", "parser"))

# Got an error with a text this size: erred_text_size + margin=50000
nlp.max_length = 4448803 + 5000
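# Editor's illustrative aside, not part of this commit: the limit above was sized by
# hand from the text that errored; it could instead be derived from the largest file
# on disk (UTF-8 byte size is an upper bound on character count), keeping the margin.
largest_file_bytes = max(
    (os.path.getsize(os.path.join(root, name))
     for root, _, names in os.walk(pdfs_folder_path)
     for name in names),
    default=0,
)
nlp.max_length = max(nlp.max_length, largest_file_bytes + 5000)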
# Serialization
def save_game():
    # Serialize vocab
    nlp.vocab.to_disk(f'{recover_files_path}/vocab')
    for i, doc_to_save in enumerate(doc_bin.get_docs(nlp.vocab)):
        print("Saving ", doc_to_save._.preview)
        doc_to_save.to_disk(f'{recover_files_path}/docs/{i}.txt')
    print("Saved")
def clean_text(txt):
    clean = preprocessing.remove_accents(
        preprocessing.replace_urls(preprocessing.normalize_whitespace(preprocessing.remove_punctuation(txt))))
    clean = [tok.text for tok in nlp.tokenizer(clean) if tok.is_stop or len(tok.text) > 2]
    return " ".join(clean)
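# Editor's illustrative aside, not part of this commit: a quick, made-up example call
# showing the kind of single cleaned string clean_text returns for a raw text.
print(clean_text("El análisis de Google y Amazon, según el boletín de hoy."))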
corpus = []

docs_length = len(list(doc_bin.get_docs(nlp.vocab)))
print(f"Docs added: {docs_length}")
# Iterate over files in folder
for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
    for idx, filename in enumerate(filenames):
        # Skip files whose docs were already recovered from disk
        if idx < docs_length:
            continue
        print(f"Opening {filename}")
        file = open(f"{pdfs_folder_path}/{filename}", "r", encoding="utf-8")
        text = file.read()
...
...
@@ -43,14 +82,16 @@ for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
        # The first len("YYYYMMDD") = 8 characters of the filename are taken as the date
        date = filename[:len("YYYYMMDD")]
        # Creates a spacy doc out of each text
        # TODO: Find out if its possible to just use nlp and then add it
        doc = textacy.make_spacy_doc((preprocessed, {"date": date}), "es")
        doc = nlp(text)
        # Shows an overview of the document
        print(filename, doc._.preview)
        print(f"Adding doc to corpus")
        corpus.append(doc)
        doc_bin.add(doc)
        # Persist progress on every file past the halfway point and on the last file
        if idx > len(filenames) / 2 or idx == len(filenames) - 1:
            print("Saving", idx)
            save_game()

print(f"All docs added: {len(corpus)}")
print(f"Adding doc to corpus")

# Split documents into tokens (units of text, commonly known as words)
...
...
@@ -62,7 +103,7 @@ def get_tokenize_docs(documents):
# Creates a matrix of all the terms in the documents: first create the vectorizer, then fill it with the vocabulary
vectorizer = textacy.vsm.Vectorizer(apply_idf=False, norm=None)

doc_term_matrix = vectorizer.fit_transform(get_tokenize_docs(corpus))
doc_term_matrix = vectorizer.fit_transform(get_tokenize_docs(doc_bin.get_docs(nlp.vocab)))

with open('results.csv', 'w', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
...
...
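The rest of the with block is collapsed in this diff, so the page does not show how results.csv is filled. A rough, hypothetical sketch of how per-document counts for the searched_words could be pulled out of doc_term_matrix, assuming textacy's Vectorizer exposes vocabulary_terms as a term-to-column mapping and that fit_transform returned a SciPy sparse matrix (this is not the committed code):

with open('results.csv', 'w', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # One row per document, one column per searched word
    writer.writerow(["doc_index"] + searched_words)
    for doc_idx in range(doc_term_matrix.shape[0]):
        row = [doc_idx]
        for word in searched_words:
            col = vectorizer.vocabulary_terms.get(word)
            row.append(int(doc_term_matrix[doc_idx, col]) if col is not None else 0)
        writer.writerow(row)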