python 在python中通过线性回归分析文本

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 在python中通过线性回归分析文本相关的知识,希望对你有一定的参考价值。

def regulate_dimension(list_of_signals):
    """Rescale numeric signals linearly so they fall between 0 and 1.

    The smallest input maps to 0.0 and the largest to 1.0.

    Args:
        list_of_signals: iterable of numbers.

    Returns:
        A new list of floats, one per input value, in input order.
        An empty input gives an empty list. When every value is identical
        (zero spread) every output is 0.0.
    """
    signals = list(list_of_signals)
    if not signals:
        return []
    lowest = min(signals)
    d_range = max(signals) - lowest
    if d_range == 0:
        # No spread to scale by; 1.0/d_range would raise ZeroDivisionError.
        return [0.0 for _ in signals]
    d_offset = -lowest
    d_scale = 1.0 / d_range
    return [(d_offset + i) * d_scale for i in signals]

def regulate_signals(list_of_tuples):
    """Regulate every dimension of a list of equal-length tuples.

    The number of dimensions is taken from the first tuple. For each
    dimension the values are gathered across all tuples and handed to
    regulate_dimension.

    Returns:
        A list holding one regulated list per dimension.
    """
    width = len(list_of_tuples[0])
    return [
        regulate_dimension([row[column] for row in list_of_tuples])
        for column in range(width)
    ]

def str_to_signals(s):
    """Convert each character of ``s`` into its code point as a float.

    Args:
        s: string (or any iterable of single characters).

    Returns:
        A list of floats, one per character, in order.
    """
    # A second, unreachable return that passed the same list through
    # regulate_dimension used to follow this line; it never ran and was
    # removed as dead code.
    return [float(ord(c)) for c in s]

# -*- coding: utf-8 -*-
# @Author: cody
# @Date:   2016-09-23 00:03:16
# @Last Modified 2016-10-02
# @Last Modified time: 2016-10-02 18:54:41

from str_to_signals import str_to_signals, regulate_dimension

def linear_regression(x, y=None):
    """Fit a least-squares line ``y = slope * x + y_intercept``.

    Args:
        x: sequence of x values. When ``y`` is omitted or empty, ``x`` is
            treated as the y values and the x values become
            0, 1, ..., len(x) - 1.
        y: optional sequence of y values paired with ``x``.

    Returns:
        A ``(slope, y_intercept)`` tuple of floats.

    Raises:
        ZeroDivisionError: when the fit is degenerate (fewer than two
            points, or every x value identical).
    """
    # None replaces the mutable default []; an explicitly passed empty
    # sequence still triggers the single-argument form.
    if y is None or len(y) < 1:
        x, y = range(len(x)), x
    n = len(x)
    X = sum(x)
    Y = sum(y)
    A = sum(xi ** 2 for xi in x)
    C = sum(xi * yi for xi, yi in zip(x, y))
    # float() forces true division so integer inputs are not floor-divided
    # on Python 2.
    D = float(X ** 2 - n * A)
    slope = (X * Y - n * C) / D
    y_intercept = (C * X - A * Y) / D
    return (slope, y_intercept)


# data that can be harvested from a string
# length
# total
# regression
# di


# runs regression against a string
def str_regress(s):
    """Return linear_regression applied to the character signals of ``s``."""
    signals = str_to_signals(s)
    return linear_regression(signals)

# generates a list of regressions for every 3 char substring in a string
def char_to_char_regression(s):
    """Yield the regression of every consecutive 3-character window of ``s``.

    A string of length n produces n - 2 results, one per window, from left
    to right. Previously each window had its last character appended a
    second time before being regressed, so 4 characters were fitted
    instead of the 3-character substring.

    Args:
        s: string of at least 3 characters.

    Yields:
        The str_regress result for each 3-character window.

    Raises:
        ValueError: on first iteration, when ``s`` has fewer than 3
            characters. This replaces an ``assert ..., exit(...)`` that
            terminated the process and vanished under ``python -O``.
    """
    if len(s) < 3:
        raise ValueError('need more letters than {}'.format(s))
    for start in range(len(s) - 2):
        yield str_regress(s[start:start + 3])

# joins a list of regression values with MORE REGRESSION!!!
def join_regression_models(models):
    """Combine several regression results by regressing over them.

    Args:
        models: iterable of pairs; element 0 of each pair is used as an x
            value and element 1 as the matching y value.

    Returns:
        The result of linear_regression over those x and y values.
    """
    # Materialize first: the pairs are walked twice below, and a generator
    # would be exhausted after the first pass, leaving y empty.
    models = list(models)
    x = [z[0] for z in models]
    y = [z[1] for z in models]
    return linear_regression(x, y)

# runs regression over list of strings and returns its joined regression
def str_list_regression(strings):
    """Regress each string, then join the per-string results into one.

    Args:
        strings: sized collection of more than one string.

    Returns:
        The result of join_regression_models over the str_regress result
        of every string.

    Raises:
        ValueError: when fewer than two strings are given. This replaces
            an ``assert ..., exit(...)`` that terminated the process and
            vanished under ``python -O``.
    """
    if len(strings) <= 1:
        raise ValueError('str_list_regression needs more strings than {}'.format(strings))
    return join_regression_models([str_regress(s) for s in strings])

# joins tuples into one large tuple
def join_tuples(list_of_tuples):
    """Flatten one level of tuples into a single tuple.

    Elements that are tuples contribute their members; every other
    element is kept as-is. Only the outermost level is flattened.
    """
    return tuple(
        member
        for entry in list_of_tuples
        for member in (entry if isinstance(entry, tuple) else (entry,))
    )

# we could generate three fingerprints from the word
# - whole regression
# - regression of first half + second half
def harvest_str_signals(s):
    """Collect the numeric signals harvested from the string ``s``.

    Args:
        s: the string to analyse.

    Returns:
        A tuple ``(slope, length)``: element 0 of ``str_regress(s)``
        followed by ``len(s)`` as a float.
    """
    # where the models will be collected
    models = []

    # slope of the regression over the whole word
    models.append(str_regress(s)[0])

    # NOTE: a further signal, the joined regression of every 3 char
    # substring in s, is currently disabled and not part of the result.

    # length of s
    models.append(float(len(s)))

    return join_tuples(models)

def test_word(w):
    """Print the word ``w`` followed by its harvested signals."""
    # print with a single parenthesized string produces the same output
    # under the Python 2 print statement and the Python 3 print function.
    print('testing: {}'.format(w))
    print("full regression - {}".format(harvest_str_signals(w)))


if __name__ == '__main__':
    # Demo: report the signals of a few similar spellings.
    for word in 'robbery robery rober rborey'.split(' '):
        test_word(word)
# -*- coding: utf-8 -*-
# @Author: cody
# @Date:   2016-09-30 12:30:30
# @Last Modified 2016-10-02
# @Last Modified time: 2016-10-02 18:56:17

from string_regression import *
from types import GeneratorType
import json
from repoze.lru import lru_cache
from math import sqrt

# regulates list of decimal signals to values between 0 and 1
def regulate_signals(list_of_tuples):
    """Rescale every dimension of a list of equal-length tuples to 0..1.

    The number of dimensions is taken from the first tuple. Within each
    dimension the smallest value maps to 0.0 and the largest to 1.0.

    Args:
        list_of_tuples: sequence of equal-length numeric tuples.

    Returns:
        A list with one entry per dimension (not per input tuple); each
        entry lists that dimension's rescaled values in input order.
        An empty input gives an empty list. A dimension whose values are
        all identical is rescaled to all 0.0.
    """
    if len(list_of_tuples) == 0:
        return []
    out = []
    for dimension in range(len(list_of_tuples[0])):
        # the values for this dimension across all tuples
        d = [i[dimension] for i in list_of_tuples]
        lowest = min(d)
        d_range = max(d) - lowest
        if d_range == 0:
            # No spread to scale by; 1.0/d_range would raise
            # ZeroDivisionError.
            out.append([0.0 for _ in d])
            continue
        d_offset = -lowest
        d_scale = 1.0 / d_range
        out.append([(d_offset + i) * d_scale for i in d])
    return out

def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Works for any number of dimensions; coordinates are paired with zip,
    so extra coordinates of the longer point are ignored.
    (by: Cody Kochmann)
    """
    squared_total = 0
    for a, b in zip(p1, p2):
        squared_total += (a - b) ** 2
    return sqrt(squared_total)


# calculates the nearest neighbor of the first arg in a list of args
# items at the same spot as the target do not count.
def nearest_neighbor(target, neighbors):
    """Find the neighbor closest to ``target``.

    Neighbors whose ``str()`` equals ``str(target)`` are skipped, so the
    target itself (or an exact duplicate) is never returned. Previously
    the result was picked by re-scanning the unfiltered neighbors, which
    could hand back a skipped item.

    Args:
        target: the reference point.
        neighbors: iterable of candidate points.

    Returns:
        A ``(neighbor, distance)`` tuple for the closest remaining
        neighbor; on ties the earliest one wins.

    Raises:
        ValueError: when no neighbor differs from the target.
    """
    candidates = [n for n in neighbors if str(n) != str(target)]
    if not candidates:
        raise ValueError('nearest_neighbor needs at least one neighbor that differs from the target')
    # Measure each candidate once and keep the distance next to it.
    scored = [(distance(n, target), n) for n in candidates]
    min_distance, nearest = min(scored, key=lambda pair: pair[0])
    return nearest, min_distance




class Str_AI_Model(object):
    """Collection of learned strings together with their regression signals.

    ``collection`` maps each learned string to a dict with two keys:
    ``'regression'`` (the signals computed for the string) and ``'count'``
    (how many times the string has been learned).
    """

    def __init__(self):
        self.collection = {}

    def consume(self, i):
        """Learn a single string, or every string of a list or generator.

        Input of any other type is ignored.
        """
        if isinstance(i, str):
            self.learn(i)
        elif isinstance(i, (list, GeneratorType)):
            for s in i:
                self.learn(s)

    def learn(self, s):
        """Record one occurrence of the string ``s``.

        Strings of two characters or fewer are ignored. A string already
        in the collection only has its count incremented.

        Raises:
            AssertionError: when ``s`` is not a ``str``.
        """
        assert isinstance(s, str), "Str_AI_Model.learn needs a string, recieved:{}".format(s)
        if len(s) > 2:
            if s in self.collection:
                self.collection[s]['count'] += 1
            else:
                # NOTE(review): `string_regression` is not defined in the
                # visible source; presumably it is expected to come from
                # the star import -- confirm which function is meant.
                regression = string_regression(s)
                self.collection[s] = {
                    'regression': regression,
                    'count': 1
                }

    def map(self):
        """Create a map from each learned string to its regulated signals.

        Returns:
            A dict mapping every string in the collection to a list with
            one rescaled value per signal dimension. An empty collection
            gives an empty dict.
        """
        strings = list(self.collection)
        if not strings:
            return {}
        per_dimension = regulate_signals([self.collection[s]['regression'] for s in strings])
        # regulate_signals returns one list per dimension; transpose it
        # back so that every string is paired with its own point instead
        # of with a whole dimension.
        per_string = [list(point) for point in zip(*per_dimension)]
        return dict(zip(strings, per_string))

    @staticmethod
    def _string_for(mapping, reg):
        """Return the first key of ``mapping`` whose value equals ``reg``, else None."""
        for key in mapping:
            if mapping[key] == reg:
                return key
        return None

    def find_by_regression(self, reg):
        """Return the learned string whose mapped signals equal ``reg``, or None."""
        return self._string_for(self.map(), reg)

    def print_collection(self):
        """Print the mapping returned by map()."""
        # A single parenthesized argument prints the same on Python 2 and 3.
        print(self.map())

    def find_each_nearest_neighbor(self):
        """Print each learned string, its nearest neighbor and their distance."""
        m = self.map()
        v = list(m.values())
        for k in m:
            # `dist` rather than `distance`, to avoid shadowing the
            # module-level distance() function.
            nearest, dist = nearest_neighbor(m[k], v)
            # Look the neighbor up in the map already built instead of
            # rebuilding it for every string.
            print('{} {} {}'.format(k, self._string_for(m, nearest), dist))

# Demo run: these statements sit at module top level without a
# __main__ guard, so they execute whenever this module is loaded.
tester = Str_AI_Model()


# Learn a handful of sample words ('jack' appears three times).
tester.consume('hello my name is jack robbery hack jack jack robery rober rborey'.split(' '))

# Print the mapping returned by Str_AI_Model.map().
tester.print_collection()

# Print each learned string with its nearest neighbor and the distance.
tester.find_each_nearest_neighbor()


以上是关于python 在python中通过线性回归分析文本的主要内容,如果未能解决你的问题,请参考以下文章

python 数据科学 - 回归分析 ☞ 线性回归

如何用Python进行线性回归以及误差分析

如何用Python进行线性回归以及误差分析

python多元线性回归怎么计算

python进行数据分析----线性回归

python之简单线性回归分析