Commit 54ff6952 authored by marioromera

save game!

parent 11fb39c1
node_modules
web/bundle.js
web/*.txt
web/*.pdf
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (nlpBullshit)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/nlpBullshit.iml" filepath="$PROJECT_DIR$/.idea/nlpBullshit.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7 (nlpBullshit)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="renderExternalDocumentation" value="true" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PySciProjectComponent">
<option name="PY_SCI_VIEW" value="true" />
<option name="PY_SCI_VIEW_SUGGESTED" value="true" />
</component>
</project>
\ No newline at end of file
# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from sklearn.feature_extraction.text import TfidfTransformer  # used below to re-weight the raw counts
from datetime import datetime
import os
import sys

sys.path.append(os.path.abspath('..'))
from util import find_analogies  # course-repo helper, resolved via the path hack above
# corpus loaders from the course repo; the exact module layout is assumed here
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab
from rnn_class.util import get_wikipedia_data
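
# For reference, a minimal stand-in for find_analogies (hypothetical sketch; the
# real course helper may differ) answering "w1 - w2 + w3 = ?" by nearest neighbor:
#
# def find_analogies(w1, w2, w3, We, word2idx, idx2word):
#     v = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
#     distances = np.linalg.norm(We - v, axis=1)
#     for idx in distances.argsort()[:4]:  # closest matches include the query words
#         print("%s - %s + %s = %s" % (w1, w2, w3, idx2word[idx]))
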
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    # sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    # make sure every analogy word is in the vocabulary before doing any work
    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from "
                      "analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        sys.exit()

    # build term-document matrix
    V = len(word2idx)
    N = len(sentences)
    print("V:", V, "N:", N)

    # create raw counts first
    A = np.zeros((V, N))
    for j, sentence in enumerate(sentences):
        for i in sentence:
            A[i, j] += 1
    print("finished getting raw counts")

    # re-weight the counts; fit_transform expects documents as rows,
    # so transpose in and transpose back out
    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # t-SNE requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i], xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed


if __name__ == '__main__':
    main()
{
"esversion": 9
}
Btext = `Afirmation" by Assata Shakur* ___ I believe in living. I believe in the spectrum of Beta days and Gamma people. I believe in sunshine. In windmills and waterfalls, tricycles and rocking chairs. And i believe that seeds grow into sprouts. And sprouts grow into trees. I believe in the magic of the hands. And in the wisdom of the eyes. I believe in rain and tears. And in the blood of in nity. DDD I believe in life. And i have seen the death parade march through the torso of the earth, sculpting mud bodies in its path. I have seen the destruction of the daylight, and seen bloodthirsty maggots prayed to and saluted. DDD I have seen the kind become the blind and the blind become the bind in one easy lesson. I have walked on cut glass. I have eaten crow and blunder bread and breathed the stench of indifference. DDD I have been locked by the lawless. Handcuffed by the haters. Gagged by the greedy. And, if i know any thing at all, it's that a wall is just a wall and nothing more at all. It can be broken down. DDD I believe in living. I believe in birth. I believe in the sweat of love and in the re of truth. DDD And i believe that a lost ship, steered by tired, seasick sailors, can still be guided home to port.`;
/* #### Generated By: http://www.cufonfonts.com #### */
@font-face {
  font-family: 'Segoe UI Regular';
  font-style: normal;
  font-weight: normal;
  src: local('Segoe UI Regular'), url('Segoe UI.woff') format('woff');
}

@font-face {
  font-family: 'Segoe UI Italic';
  font-style: normal;
  font-weight: normal;
  src: local('Segoe UI Italic'), url('Segoe UI Italic.woff') format('woff');
}

@font-face {
  font-family: 'Segoe UI Bold';
  font-style: normal;
  font-weight: normal;
  src: local('Segoe UI Bold'), url('Segoe UI Bold.woff') format('woff');
}

@font-face {
  font-family: 'Segoe UI Bold Italic';
  font-style: normal;
  font-weight: normal;
  src: local('Segoe UI Bold Italic'), url('Segoe UI Bold Italic.woff') format('woff');
}
\ No newline at end of file
.line-chart {
  border: 1px solid lightgray;
}
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Document</title>
    <link rel="stylesheet" type="text/css" href="./fonts/style.css" />
    <link rel="stylesheet" href="index.css" />
  </head>
  <body>
    <svg></svg>
    <script src="./bundle.js"></script>
  </body>
</html>
const d3 = require('d3');
const _ = require('lodash');
const tfidf = require('./tfidf');
const preprocess = require('./preprocess');
const tsnejs = require('tsne');
const fileName = 'Excerpt from: A Thousand Plateaus';
const text = `A strange mystification: a book all the more total for being fragmented. At any rate, what a vapid idea, the book as the image of the world. In truth, it is not enough to say, "Long live the multiple," difficult as it is to raise that cry. No typographical, lexical, or even syntactical cleverness is enough to make it heard. The multiple must be made, not by always adding a higher dimension, but rather in the simplest of ways, by dint of sobriety, with the number of dimensions one already has available— always n - 1 (the only way the one belongs to the multiple: always subtracted). Subtract the unique from the multiplicity to be constituted; write at n - 1 dimensions. A system of this kind could be called a rhizome. A rhizome as subterranean stem is absolutely different from roots and radicles. Bulbs and tubers are rhizomes. Plants with roots or radicles may be rhizomorphic in other respects altogether: the question is whether plant life in its specificity is not entirely rhizomatic. Even some animals are, in their pack form. Rats are rhizomes. Burrows are too, in all of their functions of shelter, supply, movement, evasion, and breakout.`;
var svgWidth = 600,
  svgHeight = 600;
var margin = { top: 40, right: 120, bottom: 140, left: 50 };
var width = svgWidth - margin.left - margin.right;
var height = svgHeight - margin.top - margin.bottom;
var svg = d3.select('svg').attr('width', svgWidth).attr('height', svgHeight);
const p = document.createElement('p');
p.setAttribute('class', 'text');
p.innerHTML = text;
document.querySelector('body').append(p);
const cleanText = preprocess(text);
const resultTfidf = tfidf(cleanText);
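// tfidf.js is not shown in this commit; resultTfidf is assumed to map each
// word to a numeric tf-idf vector that can seed t-SNE directly.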
// note: shuffling the values here would decouple each vector from the word
// order of _.keys(resultTfidf) used below, so they are kept in key order
const rawTfidf = _.cloneDeep(_.values(resultTfidf));
var opt = {};
opt.epsilon = 1000; // epsilon is learning rate (10 = default)
opt.perplexity = 1340; // roughly how many neighbors each point influences (30 = default)
opt.dim = 2;
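// note: t-SNE perplexity is usually set in the 5-50 range and well below the
// number of points; 1340 assumes a vocabulary in the thousands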
var tsne = new tsnejs.tSNE(opt);
tsne.initDataRaw(rawTfidf);
tsne.step();
var Y = tsne.getSolution(); // Y is an array of 2-D points that you can plot
console.log(Y);
const data = _.keys(resultTfidf).map((p, i) => ({
  word: p,
  doc: fileName,
  points: [Y[i]],
}));
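// `data` holds one entry per word: the word itself, its source document, and a
// growing trail of 2-D positions (one per t-SNE step) used to draw its path below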
var color = d3
  .scaleSequential()
  .domain([1, data.length])
  .interpolator(d3.interpolateRainbow);

var g = svg
  .append('g')
  .attr('transform', 'translate(' + margin.left + ',' + margin.top + ')');

// document title in the top-left corner
svg
  .append('g')
  .append('text')
  .attr('fill', '#000')
  .attr('y', 14)
  .attr('x', 1)
  .attr('text-anchor', 'middle') // 'center' is not a valid SVG text-anchor value
  .text(fileName);
var x = d3.scaleLinear().range([0, width]);
var y = d3.scaleLinear().range([height, 0]);
x.domain([-1, 1]); // initial domains; resetDomains() rescales them as the layout spreads out
y.domain([-1, 1]);
g.append('g').call(d3.axisLeft(y));
g.append('g')
  .attr('transform', 'translate(0,' + height + ')')
  .call(d3.axisBottom(x))
  .selectAll('text')
  .style('text-anchor', 'end')
  .attr('transform', 'rotate(-65)');
const points = g.append('g').attr('class', 'points');
var line = d3
  .line()
  .x((d) => x(d[0])) // apply the x scale to the x data
  .y((d) => y(d[1])) // apply the y scale to the y data
  .curve(d3.curveMonotoneX);
function resetDomains() {
  const xCords = _.flattenDeep(data.map((d) => d.points.map((p) => p[0])));
  const yCords = _.flattenDeep(data.map((d) => d.points.map((p) => p[1])));
  x.domain(d3.extent(xCords)); // rescale x to cover every recorded point
  y.domain(d3.extent(yCords)); // rescale y likewise
}
let iter = 0;
function makeStep() {
  // clear the previous frame
  points.selectAll('path').remove();
  points.selectAll('circle').remove();
  points.selectAll('text').remove();
  if (iter > 250) {
    clearInterval(embeddingInterval);
    window.alert('Finished');
    return; // stop once the interval has been cleared
  }
  tsne.step();
  Y = _.cloneDeep(tsne.getSolution());
  // record the new positions first, so the axis domains cover every point
  Y.forEach((point, i) => data[i].points.push(point));
  resetDomains();
  Y.forEach((point, i) => {
    points
      .append('circle')
      .attr('class', `point ${data[i].word}`)
      .attr('cx', x(point[0]))
      .attr('cy', y(point[1]))
      .attr('stroke', color(i))
      .attr('stroke-width', 1)
      .attr('r', '3')
      .attr('fill', 'none');
    points
      .append('path')
      .attr('class', `line ${data[i].word}`)
      .attr('d', line(data[i].points))
      .attr('stroke', color(i))
      .attr('opacity', 0.1)
      .attr('stroke-width', 1)
      .attr('fill', 'none');
    points
      .append('text')
      .attr('class', `text ${data[i].word}`)
      .attr('x', x(point[0]) - 7)
      .attr('y', y(point[1]) - 7)
      .attr('font-family', 'Segoe UI Italic') // font loaded in fonts/style.css
      .attr('font-size', '14px')
      .attr('fill', color(i))
      .text(data[i].word);
  });
  iter++;
}
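// drive the animation: each timer tick runs one t-SNE step and redraws every
// word's point, trail, and label (stops after ~250 steps)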
const embeddingInterval = setInterval(makeStep, 0);
{
  "name": "web",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@tensorflow/tfjs-tsne": "^0.2.0",
    "d3": "^5.16.0",
    "fs": "0.0.1-security",
    "lodash": "^4.17.15",
    "tsne": "^1.0.1",
    "tsne-js": "^1.0.3"
  }
}
const stopWords = require('./stop-words');
const _ = require('lodash');
function preprocess(text) {
  return _.flow(
    // lowercase everything
    (t) => _.toLower(t),
    // strip URLs
    (t) =>
      _.replace(
        t,
        /(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:[/?#](?:\S*[^\s!"'()*,-.:;<>?\[\]_`{|}~]|))?/gi,
        ''
      ),
    // drop words of 1-2 characters
    (t) => _.replace(t, /\b\w{1,2}\b/gi, ''),
    // split into sentences (quoted or plain, ending in . ? !)
    (t) => {
      let tokens = t.match(
        /(?<=\s+|^)[\"\'\‘\“\'\"\[\(\{\⟨](.*?[.?!])(\s[.?!])*[\"\'\’\”\'\"\]\)\}\⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g
      );
      if (!tokens) {
        return [t];
      }
      // remove unnecessary white space around each sentence
      // (String.prototype.trim, not the array helper below)
      tokens = tokens.map((s) => s.trim());
      // drop empty entries at either end
      return trim(tokens);
    },
    // tokenize each sentence into words
    (sentences) => _.map(sentences, (s) => wordTokenizer(s)),
    // drop stop words
    (sentences) =>
      _.map(sentences, (wds) => _.filter(wds, (w) => stopWords.indexOf(w) < 0))
  )(text);
}

function wordTokenizer(s) {
  // split on anything that is not a Latin/Cyrillic letter, digit, or underscore
  const pattern = /[^A-Za-zА-Яа-я0-9_]+/;
  return _.without(s.split(pattern), '', ' ');
}

// strip empty strings from both ends of an array
function trim(array) {
  while (array[array.length - 1] == '') array.pop();
  while (array[0] == '') array.shift();
  return array;
}
module.exports = preprocess;
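// Usage sketch (hypothetical input):
//   preprocess('Dogs are running fast! Visit https://example.com')
//   // → [['dogs', 'running', 'fast']]
// The text is lowercased, the URL stripped, 1-2 letter words dropped, sentences
// tokenized, and stop words ('are') filtered out.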
// English stop-word list (mirrors NLTK's)
const nltkStopWords = [
'i',
'me',
'my',
'myself',
'we',
'our',
'ours',
'ourselves',
'you',
"you're",
"you've",
"you'll",
"you'd",
'your',
'yours',
'yourself',
'yourselves',
'he',
'him',
'his',
'himself',
'she',
"she's",
'her',
'hers',
'herself',
'it',
"it's",
'its',
'itself',
'they',
'them',
'their',
'theirs',
'themselves',
'what',
'which',
'who',
'whom',
'this',
'that',
"that'll",
'these',
'those',
'am',
'is',
'are',
'was',
'were',
'be',
'been',
'being',
'have',
'has',
'had',
'having',
'do',
'does',
'did',
'doing',
'a',
'an',
'the',
'and',
'but',
'if',
'or',
'because',
'as',
'until',
'while',
'of',
'at',
'by',
'for',
'with',
'about',
'against',
'between',
'into',
'through',
'during',
'before',
'after',
'above',
'below',
'to',
'from',
'up',
'down',
'in',
'out',
'on',
'off',
'over',
'under',
'again',
'further',
'then',
'once',
'here',
'there',
'when',
'where',
'why',
'how',
'all',
'any',
'both',
'each',
'few',
'more',
'most',
'other',
'some',
'such',
'no',
'nor',
'not',
'only',
'own',
'same',
'so',
'than',
'too',
'very',
's',
't',
'can',
'will',
'just',
'don',
"don't",
'should',
"should've",
'now',
'd',
'll',
'm',
'o',
're',
've',
'y',
'ain',
'aren',
"aren't",
'couldn',
"couldn't",
'didn',