DocumentSimilarity

Posted 2020-10-12 WANGLC

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了DocumentSimilarity相关的知识，希望对你有一定的参考价值。

读取文件

# Load the corpus: one abstract per line, with surrounding whitespace
# (including the trailing newline) stripped. The context manager closes the
# file as soon as the list has been built.
with open('../DATA/AbstractData.txt') as abstractFile:
    abstracts = [line.strip() for line in abstractFile]

print(abstracts[:1])
[‘25177 Given n non-vertical lines in 3-space, their vertical depth (above/below) relation can contain cycles. We show that the lines can be cut into O(n3/2polylog n) pieces, such that the depth relation among these pieces is now a proper partial order. This bound is nearly tight in the worst case. As a consequence, we deduce that the number of pairwise non-overlapping cycles, namely, cycles whose xy-projections do not overlap, is O(n3/2polylog n); this bound too is almost tight in the worst case. Previous results on this topic could only handle restricted cases of the problem (such as handling only triangular cycles, by Aronov, Koltun, and Sharir, or only cycles in grid-like patterns, by Chazelle et al.), and the bounds were considerably weaker&#x2014;much closer to the trivial quadratic bound. Our proof uses a recent variant of the polynomial partitioning technique, due to Guth, and some simple tools from algebraic geometry. It is much more straightforward than the previous &#x201C;purely combinatorial&#x201D; methods. Our approach extends to eliminating all cycles in the depth relation among segments, and among constant-degree algebraic arcs. We hope that a suitable extension of this technique could be used to handle the much more difficult case of pairwise-disjoint triangles as well. Our results almost completely settle a long-standing (35 years old) open problem in computational geometry, motivated by hidden-surface removal in computer graphics. </p>‘]

获取摘要ID

# The abstract ID is the first space-delimited field of each line
# (e.g. '25177' in the sample). maxsplit=1 stops after that field instead of
# splitting the whole abstract only to discard the rest.
abstrctsId = [abstract.split(' ', 1)[0] for abstract in abstracts]
print(abstrctsId[:1])
[‘25177‘]

大小写转换

# Lower-case each abstract, then break it into whitespace-separated words.
# str.split() already returns a fresh list, so no inner comprehension is needed.
abstractLower = [abstract.lower().split() for abstract in abstracts]

print(abstractLower[:1])
[[‘25177‘, ‘given‘, ‘n‘, ‘non-vertical‘, ‘lines‘, ‘in‘, ‘3-space,‘, ‘their‘, ‘vertical‘, ‘depth‘, ‘(above/below)‘, ‘relation‘, ‘can‘, ‘contain‘, ‘cycles.‘, ‘we‘, ‘show‘, ‘that‘, ‘the‘, ‘lines‘, ‘can‘, ‘be‘, ‘cut‘, ‘into‘, ‘o(n3/2polylog‘, ‘n)‘, ‘pieces,‘, ‘such‘, ‘that‘, ‘the‘, ‘depth‘, ‘relation‘, ‘among‘, ‘these‘, ‘pieces‘, ‘is‘, ‘now‘, ‘a‘, ‘proper‘, ‘partial‘, ‘order.‘, ‘this‘, ‘bound‘, ‘is‘, ‘nearly‘, ‘tight‘, ‘in‘, ‘the‘, ‘worst‘, ‘case.‘, ‘as‘, ‘a‘, ‘consequence,‘, ‘we‘, ‘deduce‘, ‘that‘, ‘the‘, ‘number‘, ‘of‘, ‘pairwise‘, ‘non-overlapping‘, ‘cycles,‘, ‘namely,‘, ‘cycles‘, ‘whose‘, ‘xy-projections‘, ‘do‘, ‘not‘, ‘overlap,‘, ‘is‘, ‘o(n3/2polylog‘, ‘n);‘, ‘this‘, ‘bound‘, ‘too‘, ‘is‘, ‘almost‘, ‘tight‘, ‘in‘, ‘the‘, ‘worst‘, ‘case.‘, ‘previous‘, ‘results‘, ‘on‘, ‘this‘, ‘topic‘, ‘could‘, ‘only‘, ‘handle‘, ‘restricted‘, ‘cases‘, ‘of‘, ‘the‘, ‘problem‘, ‘(such‘, ‘as‘, ‘handling‘, ‘only‘, ‘triangular‘, ‘cycles,‘, ‘by‘, ‘aronov,‘, ‘koltun,‘, ‘and‘, ‘sharir,‘, ‘or‘, ‘only‘, ‘cycles‘, ‘in‘, ‘grid-like‘, ‘patterns,‘, ‘by‘, ‘chazelle‘, ‘et‘, ‘al.),‘, ‘and‘, ‘the‘, ‘bounds‘, ‘were‘, ‘considerably‘, ‘weaker&#x2014;much‘, ‘closer‘, ‘to‘, ‘the‘, ‘trivial‘, ‘quadratic‘, ‘bound.‘, ‘our‘, ‘proof‘, ‘uses‘, ‘a‘, ‘recent‘, ‘variant‘, ‘of‘, ‘the‘, ‘polynomial‘, ‘partitioning‘, ‘technique,‘, ‘due‘, ‘to‘, ‘guth,‘, ‘and‘, ‘some‘, ‘simple‘, ‘tools‘, ‘from‘, ‘algebraic‘, ‘geometry.‘, ‘it‘, ‘is‘, ‘much‘, ‘more‘, ‘straightforward‘, ‘than‘, ‘the‘, ‘previous‘, ‘&#x201c;purely‘, ‘combinatorial&#x201d;‘, ‘methods.‘, ‘our‘, ‘approach‘, ‘extends‘, ‘to‘, ‘eliminating‘, ‘all‘, ‘cycles‘, ‘in‘, ‘the‘, ‘depth‘, ‘relation‘, ‘among‘, ‘segments,‘, ‘and‘, ‘among‘, ‘constant-degree‘, ‘algebraic‘, ‘arcs.‘, ‘we‘, ‘hope‘, ‘that‘, ‘a‘, ‘suitable‘, ‘extension‘, ‘of‘, ‘this‘, ‘technique‘, ‘could‘, ‘be‘, ‘used‘, ‘to‘, ‘handle‘, ‘the‘, ‘much‘, ‘more‘, ‘difficult‘, ‘case‘, ‘of‘, ‘pairwise-disjoint‘, ‘triangles‘, ‘as‘, ‘well.‘, ‘our‘, ‘results‘, ‘almost‘, ‘completely‘, ‘settle‘, ‘a‘, ‘long-standing‘, ‘(35‘, 
‘years‘, ‘old)‘, ‘open‘, ‘problem‘, ‘in‘, ‘computational‘, ‘geometry,‘, ‘motivated‘, ‘by‘, ‘hidden-surface‘, ‘removal‘, ‘in‘, ‘computer‘, ‘graphics.‘, ‘</p>‘]]

将标点符号与单词进行分离

from nltk.tokenize import word_tokenize
# Decode each abstract from UTF-8, let NLTK split it into word and
# punctuation tokens, and lower-case every token.
# NOTE(review): the sample output shows HTML entities such as '&#x2014;' being
# split into '&', '#', 'x2014', ';' -- consider unescaping entities and
# stripping markup like '</p>' before tokenizing.
abstractsTokenized = [
    [word.lower() for word in word_tokenize(abstract.decode('utf-8'))]
    for abstract in abstracts
]

print(abstractsTokenized[:1])
[[u‘25177‘, u‘given‘, u‘n‘, u‘non-vertical‘, u‘lines‘, u‘in‘, u‘3-space‘, u‘,‘, u‘their‘, u‘vertical‘, u‘depth‘, u‘(‘, u‘above/below‘, u‘)‘, u‘relation‘, u‘can‘, u‘contain‘, u‘cycles‘, u‘.‘, u‘we‘, u‘show‘, u‘that‘, u‘the‘, u‘lines‘, u‘can‘, u‘be‘, u‘cut‘, u‘into‘, u‘o‘, u‘(‘, u‘n3/2polylog‘, u‘n‘, u‘)‘, u‘pieces‘, u‘,‘, u‘such‘, u‘that‘, u‘the‘, u‘depth‘, u‘relation‘, u‘among‘, u‘these‘, u‘pieces‘, u‘is‘, u‘now‘, u‘a‘, u‘proper‘, u‘partial‘, u‘order‘, u‘.‘, u‘this‘, u‘bound‘, u‘is‘, u‘nearly‘, u‘tight‘, u‘in‘, u‘the‘, u‘worst‘, u‘case‘, u‘.‘, u‘as‘, u‘a‘, u‘consequence‘, u‘,‘, u‘we‘, u‘deduce‘, u‘that‘, u‘the‘, u‘number‘, u‘of‘, u‘pairwise‘, u‘non-overlapping‘, u‘cycles‘, u‘,‘, u‘namely‘, u‘,‘, u‘cycles‘, u‘whose‘, u‘xy-projections‘, u‘do‘, u‘not‘, u‘overlap‘, u‘,‘, u‘is‘, u‘o‘, u‘(‘, u‘n3/2polylog‘, u‘n‘, u‘)‘, u‘;‘, u‘this‘, u‘bound‘, u‘too‘, u‘is‘, u‘almost‘, u‘tight‘, u‘in‘, u‘the‘, u‘worst‘, u‘case‘, u‘.‘, u‘previous‘, u‘results‘, u‘on‘, u‘this‘, u‘topic‘, u‘could‘, u‘only‘, u‘handle‘, u‘restricted‘, u‘cases‘, u‘of‘, u‘the‘, u‘problem‘, u‘(‘, u‘such‘, u‘as‘, u‘handling‘, u‘only‘, u‘triangular‘, u‘cycles‘, u‘,‘, u‘by‘, u‘aronov‘, u‘,‘, u‘koltun‘, u‘,‘, u‘and‘, u‘sharir‘, u‘,‘, u‘or‘, u‘only‘, u‘cycles‘, u‘in‘, u‘grid-like‘, u‘patterns‘, u‘,‘, u‘by‘, u‘chazelle‘, u‘et‘, u‘al‘, u‘.‘, u‘)‘, u‘,‘, u‘and‘, u‘the‘, u‘bounds‘, u‘were‘, u‘considerably‘, u‘weaker‘, u‘&‘, u‘#‘, u‘x2014‘, u‘;‘, u‘much‘, u‘closer‘, u‘to‘, u‘the‘, u‘trivial‘, u‘quadratic‘, u‘bound‘, u‘.‘, u‘our‘, u‘proof‘, u‘uses‘, u‘a‘, u‘recent‘, u‘variant‘, u‘of‘, u‘the‘, u‘polynomial‘, u‘partitioning‘, u‘technique‘, u‘,‘, u‘due‘, u‘to‘, u‘guth‘, u‘,‘, u‘and‘, u‘some‘, u‘simple‘, u‘tools‘, u‘from‘, u‘algebraic‘, u‘geometry‘, u‘.‘, u‘it‘, u‘is‘, u‘much‘, u‘more‘, u‘straightforward‘, u‘than‘, u‘the‘, u‘previous‘, u‘&‘, u‘#‘, u‘x201c‘, u‘;‘, u‘purely‘, u‘combinatorial‘, u‘&‘, u‘#‘, u‘x201d‘, u‘;‘, u‘methods‘, u‘.‘, u‘our‘, u‘approach‘, u‘extends‘, u‘to‘, u‘eliminating‘, u‘all‘, u‘cycles‘, u‘in‘, u‘the‘, 
u‘depth‘, u‘relation‘, u‘among‘, u‘segments‘, u‘,‘, u‘and‘, u‘among‘, u‘constant-degree‘, u‘algebraic‘, u‘arcs‘, u‘.‘, u‘we‘, u‘hope‘, u‘that‘, u‘a‘, u‘suitable‘, u‘extension‘, u‘of‘, u‘this‘, u‘technique‘, u‘could‘, u‘be‘, u‘used‘, u‘to‘, u‘handle‘, u‘the‘, u‘much‘, u‘more‘, u‘difficult‘, u‘case‘, u‘of‘, u‘pairwise-disjoint‘, u‘triangles‘, u‘as‘, u‘well‘, u‘.‘, u‘our‘, u‘results‘, u‘almost‘, u‘completely‘, u‘settle‘, u‘a‘, u‘long-standing‘, u‘(‘, u‘35‘, u‘years‘, u‘old‘, u‘)‘, u‘open‘, u‘problem‘, u‘in‘, u‘computational‘, u‘geometry‘, u‘,‘, u‘motivated‘, u‘by‘, u‘hidden-surface‘, u‘removal‘, u‘in‘, u‘computer‘, u‘graphics‘, u‘.‘, u‘<‘, u‘/p‘, u‘>‘]]

除去停用词

from nltk.corpus import stopwords
englishStopwords = stopwords.words('english')
print(len(englishStopwords))
# Build the lookup set once: set membership is O(1), whereas testing against
# the list rescans every stopword for every token of every abstract.
stopwordSet = set(englishStopwords)
# Drop every token that is an English stopword.
abstractFilterStopwords = [
    [word for word in abstract if word not in stopwordSet]
    for abstract in abstractsTokenized
]
print(abstractFilterStopwords[:1])

[[u‘25177‘, u‘given‘, u‘n‘, u‘non-vertical‘, u‘lines‘, u‘3-space‘, u‘,‘, u‘vertical‘, u‘depth‘, u‘(‘, u‘above/below‘, u‘)‘, u‘relation‘, u‘contain‘, u‘cycles‘, u‘.‘, u‘show‘, u‘lines‘, u‘cut‘, u‘o‘, u‘(‘, u‘n3/2polylog‘, u‘n‘, u‘)‘, u‘pieces‘, u‘,‘, u‘depth‘, u‘relation‘, u‘among‘, u‘pieces‘, u‘proper‘, u‘partial‘, u‘order‘, u‘.‘, u‘bound‘, u‘nearly‘, u‘tight‘, u‘worst‘, u‘case‘, u‘.‘, u‘consequence‘, u‘,‘, u‘deduce‘, u‘number‘, u‘pairwise‘, u‘non-overlapping‘, u‘cycles‘, u‘,‘, u‘namely‘, u‘,‘, u‘cycles‘, u‘whose‘, u‘xy-projections‘, u‘overlap‘, u‘,‘, u‘o‘, u‘(‘, u‘n3/2polylog‘, u‘n‘, u‘)‘, u‘;‘, u‘bound‘, u‘almost‘, u‘tight‘, u‘worst‘, u‘case‘, u‘.‘, u‘previous‘, u‘results‘, u‘topic‘, u‘could‘, u‘handle‘, u‘restricted‘, u‘cases‘, u‘problem‘, u‘(‘, u‘handling‘, u‘triangular‘, u‘cycles‘, u‘,‘, u‘aronov‘, u‘,‘, u‘koltun‘, u‘,‘, u‘sharir‘, u‘,‘, u‘cycles‘, u‘grid-like‘, u‘patterns‘, u‘,‘, u‘chazelle‘, u‘et‘, u‘al‘, u‘.‘, u‘)‘, u‘,‘, u‘bounds‘, u‘considerably‘, u‘weaker‘, u‘&‘, u‘#‘, u‘x2014‘, u‘;‘, u‘much‘, u‘closer‘, u‘trivial‘, u‘quadratic‘, u‘bound‘, u‘.‘, u‘proof‘, u‘uses‘, u‘recent‘, u‘variant‘, u‘polynomial‘, u‘partitioning‘, u‘technique‘, u‘,‘, u‘due‘, u‘guth‘, u‘,‘, u‘simple‘, u‘tools‘, u‘algebraic‘, u‘geometry‘, u‘.‘, u‘much‘, u‘straightforward‘, u‘previous‘, u‘&‘, u‘#‘, u‘x201c‘, u‘;‘, u‘purely‘, u‘combinatorial‘, u‘&‘, u‘#‘, u‘x201d‘, u‘;‘, u‘methods‘, u‘.‘, u‘approach‘, u‘extends‘, u‘eliminating‘, u‘cycles‘, u‘depth‘, u‘relation‘, u‘among‘, u‘segments‘, u‘,‘, u‘among‘, u‘constant-degree‘, u‘algebraic‘, u‘arcs‘, u‘.‘, u‘hope‘, u‘suitable‘, u‘extension‘, u‘technique‘, u‘could‘, u‘used‘, u‘handle‘, u‘much‘, u‘difficult‘, u‘case‘, u‘pairwise-disjoint‘, u‘triangles‘, u‘well‘, u‘.‘, u‘results‘, u‘almost‘, u‘completely‘, u‘settle‘, u‘long-standing‘, u‘(‘, u‘35‘, u‘years‘, u‘old‘, u‘)‘, u‘open‘, u‘problem‘, u‘computational‘, u‘geometry‘, u‘,‘, u‘motivated‘, u‘hidden-surface‘, u‘removal‘, u‘computer‘, u‘graphics‘, u‘.‘, u‘<‘, u‘/p‘, u‘>‘]]

除去标点符号

# Single-character punctuation tokens to discard. Multi-character leftovers
# such as '/p' are not in this list and therefore survive the filter.
englishPunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '<', '>', '=', '{', '}', '+', '"', '-', '/']
# NOTE: this rebinds `abstracts` to the filtered token lists (one list of
# tokens per abstract).
abstracts = [
    [word for word in abstract if word not in englishPunctuations]
    for abstract in abstractFilterStopwords
]

print(abstracts[:1])
[[u‘25177‘, u‘given‘, u‘n‘, u‘non-vertical‘, u‘lines‘, u‘3-space‘, u‘vertical‘, u‘depth‘, u‘above/below‘, u‘relation‘, u‘contain‘, u‘cycles‘, u‘show‘, u‘lines‘, u‘cut‘, u‘o‘, u‘n3/2polylog‘, u‘n‘, u‘pieces‘, u‘depth‘, u‘relation‘, u‘among‘, u‘pieces‘, u‘proper‘, u‘partial‘, u‘order‘, u‘bound‘, u‘nearly‘, u‘tight‘, u‘worst‘, u‘case‘, u‘consequence‘, u‘deduce‘, u‘number‘, u‘pairwise‘, u‘non-overlapping‘, u‘cycles‘, u‘namely‘, u‘cycles‘, u‘whose‘, u‘xy-projections‘, u‘overlap‘, u‘o‘, u‘n3/2polylog‘, u‘n‘, u‘bound‘, u‘almost‘, u‘tight‘, u‘worst‘, u‘case‘, u‘previous‘, u‘results‘, u‘topic‘, u‘could‘, u‘handle‘, u‘restricted‘, u‘cases‘, u‘problem‘, u‘handling‘, u‘triangular‘, u‘cycles‘, u‘aronov‘, u‘koltun‘, u‘sharir‘, u‘cycles‘, u‘grid-like‘, u‘patterns‘, u‘chazelle‘, u‘et‘, u‘al‘, u‘bounds‘, u‘considerably‘, u‘weaker‘, u‘x2014‘, u‘much‘, u‘closer‘, u‘trivial‘, u‘quadratic‘, u‘bound‘, u‘proof‘, u‘uses‘, u‘recent‘, u‘variant‘, u‘polynomial‘, u‘partitioning‘, u‘technique‘, u‘due‘, u‘guth‘, u‘simple‘, u‘tools‘, u‘algebraic‘, u‘geometry‘, u‘much‘, u‘straightforward‘, u‘previous‘, u‘x201c‘, u‘purely‘, u‘combinatorial‘, u‘x201d‘, u‘methods‘, u‘approach‘, u‘extends‘, u‘eliminating‘, u‘cycles‘, u‘depth‘, u‘relation‘, u‘among‘, u‘segments‘, u‘among‘, u‘constant-degree‘, u‘algebraic‘, u‘arcs‘, u‘hope‘, u‘suitable‘, u‘extension‘, u‘technique‘, u‘could‘, u‘used‘, u‘handle‘, u‘much‘, u‘difficult‘, u‘case‘, u‘pairwise-disjoint‘, u‘triangles‘, u‘well‘, u‘results‘, u‘almost‘, u‘completely‘, u‘settle‘, u‘long-standing‘, u‘35‘, u‘years‘, u‘old‘, u‘open‘, u‘problem‘, u‘computational‘, u‘geometry‘, u‘motivated‘, u‘hidden-surface‘, u‘removal‘, u‘computer‘, u‘graphics‘, u‘/p‘]]

单词词干化

from nltk.stem.lancaster import LancasterStemmer
# Reduce every token of every abstract to its Lancaster stem, keeping the
# per-abstract grouping.
st = LancasterStemmer()
abstractsStemed = []
for abstract in abstracts:
    abstractsStemed.append([st.stem(word) for word in abstract])
print(abstractsStemed[:1])

[[u‘25177‘, u‘giv‘, u‘n‘, u‘non-vertical‘, u‘lin‘, u‘3-space‘, u‘vert‘, u‘dep‘, u‘above/below‘, u‘rel‘, u‘contain‘, u‘cyc‘, u‘show‘, u‘lin‘, u‘cut‘, u‘o‘, u‘n3/2polylog‘, u‘n‘, u‘piec‘, u‘dep‘, u‘rel‘, u‘among‘, u‘piec‘, u‘prop‘, u‘part‘, u‘ord‘, u‘bound‘, u‘near‘, u‘tight‘, u‘worst‘, u‘cas‘, u‘consequ‘, u‘deduc‘, u‘numb‘, u‘pairw‘, u‘non-overlapping‘, u‘cyc‘, u‘nam‘, u‘cyc‘, u‘whos‘, u‘xy-projections‘, u‘overlap‘, u‘o‘, u‘n3/2polylog‘, u‘n‘, u‘bound‘, u‘almost‘, u‘tight‘, u‘worst‘, u‘cas‘, u‘prevy‘, u‘result‘, u‘top‘, u‘could‘, u‘handl‘, u‘restrict‘, u‘cas‘, u‘problem‘, u‘handl‘, u‘triangul‘, u‘cyc‘, u‘aronov‘, u‘koltun‘, u‘sharir‘, u‘cyc‘, u‘grid-like‘, u‘pattern‘, u‘chazel‘, u‘et‘, u‘al‘, u‘bound‘, u‘consid‘, u‘weak‘, u‘x2014‘, u‘much‘, u‘clos‘, u‘triv‘, u‘quadr‘, u‘bound‘, u‘proof‘, u‘us‘, u‘rec‘, u‘vary‘, u‘polynom‘, u‘partit‘, u‘techn‘, u‘due‘, u‘guth‘, u‘simpl‘, u‘tool‘, u‘algebra‘, u‘geometry‘, u‘much‘, u‘straightforward‘, u‘prevy‘, u‘x201c‘, u‘pur‘, u‘combin‘, u‘x201d‘, u‘method‘, u‘approach‘, u‘extend‘, u‘elimin‘, u‘cyc‘, u‘dep‘, u‘rel‘, u‘among‘, u‘seg‘, u‘among‘, u‘constant-degree‘, u‘algebra‘, u‘arc‘, u‘hop‘, u‘suit‘, u‘extend‘, u‘techn‘, u‘could‘, u‘us‘, u‘handl‘, u‘much‘, u‘difficult‘, u‘cas‘, u‘pairwise-disjoint‘, u‘triangl‘, u‘wel‘, u‘result‘, u‘almost‘, u‘complet‘, u‘settl‘, u‘long-stand‘, u‘35‘, u‘year‘, u‘old‘, u‘op‘, u‘problem‘, u‘comput‘, u‘geometry‘, u‘mot‘, u‘hidden-surface‘, u‘remov‘, u‘comput‘, u‘graph‘, u‘/p‘]]

去除低频词

from collections import Counter
from itertools import chain

# Flatten all abstracts into one list of stems. chain.from_iterable is linear,
# whereas sum(list_of_lists, []) copies the accumulated list at every step.
allStem = list(chain.from_iterable(abstractsStemed))
# Count every stem in a single pass instead of calling allStem.count() once
# per distinct stem.
stemCounts = Counter(allStem)
# Stems that occur exactly once in the whole corpus.
stemOnce = {stem for stem, count in stemCounts.items() if count == 1}
# Keep only the stems that occur more than once across the corpus.
abstractOver = [
    [word for word in abstract if word not in stemOnce]
    for abstract in abstractsStemed
]
print(abstractOver[:1])

以上是关于DocumentSimilarity的主要内容，如果未能解决你的问题，请参考以下文章