python 在python中通过线性回归分析文本
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 在python中通过线性回归分析文本相关的知识,希望对你有一定的参考价值。
def regulate_dimension(list_of_signals):
    """Rescale numeric signals linearly onto the range [0, 1].

    Args:
        list_of_signals: non-empty sequence of numbers.

    Returns:
        List of floats where the minimum maps to 0.0 and the maximum to 1.0.
        When every signal is identical (zero range) all values map to 0.0 --
        the original divided by zero in that case.
    """
    lo = min(list_of_signals)
    hi = max(list_of_signals)
    span = hi - lo
    if span == 0:
        # Degenerate input: nothing to scale; avoid ZeroDivisionError.
        return [0.0 for _ in list_of_signals]
    # float() keeps true division under Python 2 as well.
    return [(s - lo) / float(span) for s in list_of_signals]
def regulate_signals(list_of_tuples):
    """Normalise each dimension of a list of equal-length tuples to [0, 1].

    Returns one list per dimension (not per tuple), each rescaled by
    regulate_dimension().
    """
    dimension_count = len(list_of_tuples[0])
    return [
        regulate_dimension([row[axis] for row in list_of_tuples])
        for axis in range(dimension_count)
    ]
def str_to_signals(s):
    """Convert each character of *s* to its Unicode code point as a float.

    Args:
        s: any string (may be empty).

    Returns:
        List of floats, one per character.
    """
    # The original had a second, unreachable `return` after this one
    # (dead code); it has been removed.
    return [float(ord(c)) for c in s]
# -*- coding: utf-8 -*-
# @Author: cody
# @Date: 2016-09-23 00:03:16
# @Last Modified 2016-10-02
# @Last Modified time: 2016-10-02 18:54:41
from str_to_signals import str_to_signals, regulate_dimension
def linear_regression(x, y=None):
    """Ordinary least-squares fit of ``y = slope * x + y_intercept``.

    Args:
        x: sequence of x values -- or of y values when *y* is omitted.
        y: optional sequence of y values; when omitted (or empty), *x* is
           treated as the y series and the indices 0..len(x)-1 become x.

    Returns:
        (slope, y_intercept) tuple of floats.

    Raises:
        ZeroDivisionError: when the denominator D is zero (all x equal,
        or fewer than two points) -- same as the original.
    """
    # The original used a mutable default argument (y=[]); None preserves
    # the same "omitted/empty y" call semantics without that pitfall.
    if y is None or len(y) < 1:
        x, y = range(len(x)), x
    n = len(x)
    X = sum(x)
    Y = sum(y)
    A = sum(xi ** 2 for xi in x)               # sum of x^2
    C = sum(xi * yi for xi, yi in zip(x, y))   # sum of x*y
    D = X ** 2 - n * A                         # shared denominator
    # float() forces true division: under Python 2 the original floor-divided
    # for integer inputs, silently truncating the fit.
    slope = float(X * Y - n * C) / D
    y_intercept = float(C * X - A * Y) / D
    return (slope, y_intercept)
# data that can be harvested from a string
# length
# total
# regression
# di
# runs regression against a string
def str_regress(s):
    """Fit a linear regression over the character signals of string *s*."""
    signals = str_to_signals(s)
    return linear_regression(signals)
# generates a list of regressions for every 3 char substring in a string
def char_to_char_regression(s):
    """Yield a regression model for every 3-character window of *s*.

    For strings longer than 3 characters, yields one (slope, intercept)
    tuple per sliding 3-char window; for exactly 3 characters, yields a
    single regression of the whole string.

    Raises:
        ValueError: if *s* has fewer than 3 characters.  (The original used
        ``assert ..., exit(...)``: evaluating the failure message called
        exit() and terminated the interpreter instead of raising.)
    """
    if len(s) < 3:
        raise ValueError('need more letters than {}'.format(s))
    if len(s) == 3:
        yield str_regress(s)
        return
    window = ''
    for ch in s:
        window += ch
        # The original tested `len(previous) is 3` (identity comparison on
        # an int) and regressed `previous + c`, which appended the current
        # character a second time, producing a 4-char window with a
        # duplicated final character.  Both are fixed here.
        if len(window) == 3:
            yield str_regress(window)
            window = window[1:]
# joins a list of regression values with MORE REGRESSION!!!
def join_regression_models(models):
    """Collapse many (slope, intercept) models into one via more regression.

    The slopes become the x series and the intercepts the y series of a
    second-level linear regression.

    Args:
        models: iterable of (slope, intercept) pairs.

    Returns:
        A single (slope, y_intercept) tuple.
    """
    slopes = [m[0] for m in models]
    intercepts = [m[1] for m in models]
    # A dead `avg` lambda and a commented-out averaging return were removed.
    return linear_regression(slopes, intercepts)
# runs regression over list of strings and returns its joined regression
def str_list_regression(strings):
    """Regress each string, then join the models with another regression.

    Args:
        strings: list of at least two strings.

    Returns:
        A single joined (slope, y_intercept) tuple.

    Raises:
        ValueError: if fewer than two strings are supplied.  (The original
        ``assert ..., exit(...)`` would call exit() while building the
        failure message and kill the interpreter instead of raising.)
    """
    if len(strings) <= 1:
        raise ValueError('str_list_regression needs more strings than {}'.format(strings))
    return join_regression_models([str_regress(s) for s in strings])
# joins tuples into one large tuple
def join_tuples(list_of_tuples):
    """Flatten one level: tuple elements are spliced in, others kept as-is.

    Args:
        list_of_tuples: iterable whose items are tuples or scalars.

    Returns:
        A single tuple containing every scalar and every tuple member.
    """
    flattened = []
    for item in list_of_tuples:
        if isinstance(item, tuple):
            flattened.extend(item)
        else:
            flattened.append(item)
    return tuple(flattened)
# we could generate three fingerprints from the word
# - whole regression
# - regression of first half + second half
def harvest_str_signals(s):
    """Collect numeric fingerprint signals for string *s*.

    Returns:
        Tuple of (slope of the whole-string regression, len(s) as float).
    """
    # Where the models will be collected.
    models = []
    # Slope of a regression over the whole word's character signals.
    models.append(str_regress(s)[0])
    # Regression for every 3 char substring was disabled in the original:
    # models.append(join_regression_models(char_to_char_regression(s)))
    # Length of s as an extra dimension.
    models.append(float(len(s)))
    # Unused locals `signals` and `half_length` from the original (computed
    # but never read) were removed.
    return join_tuples(models)
# Prints the harvested signal fingerprint for word *w*.
# Uses Python 2 print-statement syntax (this module is Python 2 code).
def test_word(w):
    print 'testing:',w
    print "full regression - {}".format(harvest_str_signals(w))
# Script entry point: fingerprint a few (mis)spellings of "robbery".
if __name__ == '__main__':
    for i in 'robbery robery rober rborey'.split(' '):
        test_word(i)
# -*- coding: utf-8 -*-
# @Author: cody
# @Date: 2016-09-30 12:30:30
# @Last Modified 2016-10-02
# @Last Modified time: 2016-10-02 18:56:17
from string_regression import *
from types import GeneratorType
import json
from repoze.lru import lru_cache
from math import sqrt
# regulates list of decimal signals to values between 0 and 1
def regulate_signals(list_of_tuples):
    """Rescale every dimension of a list of equal-length tuples onto [0, 1].

    Args:
        list_of_tuples: non-empty list of equal-length numeric tuples.

    Returns:
        One inner list per DIMENSION (not per tuple); within each dimension
        the minimum maps to 0.0 and the maximum to 1.0.  A constant
        dimension maps to all zeros -- the original raised
        ZeroDivisionError in that case.
    """
    out = []
    for dimension in range(len(list_of_tuples[0])):
        # Collect this dimension's value across every tuple.
        d = [row[dimension] for row in list_of_tuples]
        lo, hi = min(d), max(d)
        span = hi - lo
        if span == 0:
            # Degenerate: every value identical in this dimension.
            out.append([0.0 for _ in d])
        else:
            # float() keeps true division under Python 2 as well.
            out.append([(v - lo) / float(span) for v in d])
    return out
def distance(p1, p2):
    """Euclidean distance between two points of any (equal) dimensionality.

    by: Cody Kochmann
    """
    squared_deltas = [(a - b) ** 2 for a, b in zip(p1, p2)]
    return sqrt(sum(squared_deltas))
# calculates the nearest neighbor of the first arg in a list of args
# items the same spot do not count.
def nearest_neighbor(target, neighbors):
    """Find the neighbor closest to *target* in n-dimensional space.

    Points whose str() form equals the target's are skipped, so the target
    does not match itself when it also appears in *neighbors*.

    Args:
        target: a point (sequence of numbers).
        neighbors: iterable of points of the same dimensionality.

    Returns:
        (nearest_point, distance) tuple, or None when every neighbor is
        excluded (the original crashed with ValueError from min() on an
        empty sequence in that case).
    """
    candidates = [p for p in neighbors if str(p) != str(target)]
    if not candidates:
        return None
    # Single keyed min() instead of the original two-pass scan, which also
    # re-derived the winner by exact float equality against min_distance.
    best = min(candidates, key=lambda p: distance(p, target))
    return best, distance(best, target)
class Str_AI_Model(object):
    """Collects strings, fingerprints each with a regression model, and
    compares them by nearest-neighbour distance in signal space.

    NOTE(review): learn() calls ``string_regression(s)``, but no function of
    that name is visible -- ``from string_regression import *`` binds the
    module's contents, not the module name itself, so this is a NameError at
    runtime.  Presumably str_regress() or harvest_str_signals() was meant;
    confirm against the author's intent before fixing.
    """
    def __init__(self):
        # maps string -> {'regression': <model tuple>, 'count': <int>}
        self.collection={}
    """ consumes strings and lists/generators of strings """
    def consume(self,i):
        # Accepts a single string, or a list/generator of strings; anything
        # else is silently ignored.
        if isinstance(i, str): self.learn(i)
        if isinstance(i, list) or isinstance(i, GeneratorType):
            for s in i: self.learn(s)
    def learn(self,s):
        # Record one occurrence of string s; strings of length <= 2 are
        # silently skipped.
        assert isinstance(s, str), "Str_AI_Model.learn needs a string, recieved:{}".format(s)
        if len(s) > 2:
            if s in self.collection:
                self.collection[s]['count']+=1
            else:
                # NOTE(review): NameError -- see class docstring.
                regression = string_regression(s)
                self.collection[s] = {
                    'regression':regression,
                    'count':1
                }
    def map(self): # creates a map of the strings in its collection
        strings = [ s for s in self.collection ]
        # NOTE(review): regulate_signals returns one list per DIMENSION, not
        # per string, so this zip pairs each string with a dimension and
        # truncates to the number of dimensions -- looks like the signals
        # need transposing first; verify intended shape.
        signals = regulate_signals([self.collection[s]['regression'] for s in strings])
        out = {}
        for string,signal in zip(strings,signals):
            # NOTE(review): the inner loop rewrites the same key 'count'
            # times; it has no visible effect on the resulting dict.
            for i in range(self.collection[string]['count']):
                out[string]=signal
        return out
    # returns the corresponding string to each regression
    def find_by_regression(self,reg):
        # Reverse-lookup: the string whose mapped signal equals reg, else None.
        m = self.map()
        if reg in m.values():
            for k in m:
                if m[k] == reg:
                    return k
        else:
            return None
    def print_collection(self): # prints the collection
        # Python 2 print-statement syntax (this module is Python 2 code).
        print self.map()
    def find_each_nearest_neighbor(self): # prints the nearest neighbor of each string
        m = self.map()
        v = m.values()
        for k in m:
            # NOTE: local name shadows the module-level distance() function.
            nearest, distance = nearest_neighbor(m[k], v)
            print k, self.find_by_regression(nearest), distance
# Module-level demo: learn a handful of words, then print the collection and
# each word's nearest neighbour.  Runs on import (no __main__ guard).
tester = Str_AI_Model()
tester.consume('hello my name is jack robbery hack jack jack robery rober rborey'.split(' '))
tester.print_collection()
tester.find_each_nearest_neighbor()
以上是关于python 在python中通过线性回归分析文本的主要内容,如果未能解决你的问题,请参考以下文章