Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
marioromera
digital_capitalism
Commits
0e32faa0
Commit
0e32faa0
authored
May 27, 2020
by
marioromera
Browse files
Save each 1/3 percentage
parent
6df93a63
Changes
1
Hide whitespace changes
Inline
Side-by-side
text_analyzer.py
View file @
0e32faa0
...
...
@@ -8,6 +8,15 @@ from textacy import preprocessing
import
textacy.vsm
import
csv
# Input word to search for, in the future can be a list
searched_words
=
[
"google"
,
"amazon"
,
"netflix"
,
"facebook"
,
"microsoft"
,
"apple"
,
"vodafone"
,
"alibaba"
,
"tesla"
,
"5g"
,
"twitter"
,
"telefónica"
,
"indra"
,
"airbnb"
,
"tinder"
,
"expedia"
,
"bloomberg"
,
"paypal"
,
"baidu"
,
"smart city"
]
# Folder from where files will be loaded
pdfs_folder_path
=
"./downloaded_files/texts"
doc_bin
=
DocBin
(
store_user_data
=
True
)
recover_files_path
=
"./recover_files"
if
not
os
.
path
.
exists
(
recover_files_path
):
...
...
@@ -24,15 +33,6 @@ for (dirpath, dirnames, filenames) in os.walk(docs_path):
for
idx
,
filename
in
enumerate
(
filenames
):
doc_bin
.
add
(
spacy
.
tokens
.
Doc
(
nlp_vocab
).
from_disk
(
f
"
{
docs_path
}
/
{
filename
}
"
))
# Input word to search for, in the future can be a list
searched_words
=
[
"google"
,
"amazon"
,
"netflix"
,
"facebook"
,
"microsoft"
,
"apple"
,
"vodafone"
,
"alibaba"
,
"tesla"
,
"5g"
,
"twitter"
,
"telefónica"
,
"indra"
,
"airbnb"
,
"tinder"
,
"expedia"
,
"bloomberg"
,
"paypal"
,
"baidu"
,
"smart city"
]
# Folder from where files will be loaded
pdfs_folder_path
=
"./downloaded_files/texts_to_debug"
spacy_stopwords
=
spacy
.
lang
.
es
.
stop_words
.
STOP_WORDS
# Spacy library with spanish language loaded
...
...
@@ -62,12 +62,13 @@ def clean_text(txt):
docs_length
=
len
(
list
(
doc_bin
.
get_docs
(
nlp
.
vocab
)))
print
(
f
"Docs
add
ed:
{
docs_length
}
"
)
print
(
f
"Docs
recover
ed:
{
docs_length
}
"
)
# Iterate over files in folder
for
(
dirpath
,
dirnames
,
filenames
)
in
os
.
walk
(
pdfs_folder_path
):
for
idx
,
filename
in
enumerate
(
filenames
):
# TODO: Find a better way to know if its recovered
if
idx
<
docs_length
:
continue
...
...
@@ -87,11 +88,16 @@ for (dirpath, dirnames, filenames) in os.walk(pdfs_folder_path):
print
(
filename
,
doc
.
_
.
preview
)
doc_bin
.
add
(
doc
)
if
idx
>
len
(
filenames
)
/
2
or
idx
==
len
(
filenames
)
-
1
:
# Save at each 1/8 of the files (len(filenames)/8) or at the end
if
(
idx
%
int
(
len
(
filenames
)
/
8
))
==
0
or
idx
==
len
(
filenames
)
-
1
:
print
(
"Saving"
,
idx
)
# TODO: Optimize to save from index
# from_idx = abs(len(filenames) - len(filenames) - idx)
save_game
()
print
(
f
"Adding doc to corpus"
)
print
(
f
"Adding doc to corpus"
)
# Split documents into tokens(units of text, commonly known as words)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment