从Go中的给定字符串计算句子中的最大单词

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了从Go中的给定字符串计算句子中的最大单词相关的知识,希望对你有一定的参考价值。

我是Go的新手......我正在寻找新的方法来优化和/或修复此算法,以计算来自给定字符串的句子中的最大单词。句子以'?'结尾要么 '!'要么 '。'和函数应该返回int> = 0。

// MaxWordsInSentences - return max words in one sentences
func MaxWordsInSentences(S string) (result int) {


    r, _ := regexp.Compile("[.||?||!]")
    count := strings.Count(S, ".") + strings.Count(S, "!") + strings.Count(S, "?") // Total sentaces

    for i := 0; i < count; i++ {
        sentence := r.Split(S, count)[i]
        splitSentence := strings.Split(sentence, " ")

        var R []string
        for _, str := range splitSentence {
            if str != "" {
                R = append(R, str)
            }
        }

        if len(R) > result {
            result = len(R)
        }
    }

    return

}

句子=> "One two three four five six seven eight. One two? One two three four five six seven eight nine? One two three! One two three four."

结果应该返回9

答案

在您提供的简单测试用例中,您的算法似乎有效。您的算法在真实文本上不能很好地工作。


考虑我的简单算法:

func maxSentenceWords(s string) int {
    maxWords, nWords := 0, 0
    inWord := false
    for _, r := range s {
        switch r {
        case '.', '?', '!':
            inWord = false
            if maxWords < nWords {
                maxWords = nWords
            }
            nWords = 0
        default:
            if unicode.IsSpace(r) {
                inWord = false
            } else if inWord == false {
                inWord = true
                nWords++
            }
        }
        if maxWords < nWords {
            maxWords = nWords
        }
    }
    return maxWords
}

游乐场:https://play.golang.org/p/OD8jNW1hyAa

它通过了你的简单测试。短基准(Lorem Ipsum)运行得非常快,长基准(莎士比亚)运行得很快

$ go test words_test.go -run=PeterSO -v -bench=PeterSO -benchmem -timeout=5m
=== RUN   TestPeterSO
--- PASS: TestPeterSO (0.00s)
BenchmarkPeterSOL-4    300000        4027 ns/op    0 B/op    0 allocs/op
BenchmarkPeterSOS-4        20    54084832 ns/op    0 B/op    0 allocs/op
$ 

考虑您的复杂算法:

func MaxWordsInSentences(S string) (result int) {
    r, _ := regexp.Compile("[.||?||!]")
    count := strings.Count(S, ".") + strings.Count(S, "!") + strings.Count(S, "?") // Total sentaces

    for i := 0; i < count; i++ {
        sentence := r.Split(S, count)[i]
        splitSentence := strings.Split(sentence, " ")

        var R []string
        for _, str := range splitSentence {
            if str != "" {
                R = append(R, str)
            }
        }

        if len(R) > result {
            result = len(R)
        }
    }

    return
}

游乐场:https://play.golang.org/p/MCj-XxEid73

它通过了你的简单测试。短基准(Lorem Ipsum)运行缓慢,长基准(莎士比亚)运行很长时间(5分钟后死亡)。

$ go test words_test.go -run=Ljubon -v -bench=Ljubon -benchmem -timeout=5m
=== RUN   TestLjubon
--- PASS: TestLjubon (0.00s)
BenchmarkLjubonL-4    20000    78623 ns/op    6984 B/op   62 allocs/op
*** Test killed with quit: ran too long (6m0s).
$ 

test words_test.go

package main

import (
    "fmt"
    "io/ioutil"
    "regexp"
    "strings"
    "testing"
    "unicode"
)

var sentences = "One two three four five six seven eight. One two? One two three four five six seven eight nine? One two three! One two three four."

var loremipsum = `
Lorem ipsum dolor sit amet, consectetur adipiscing elit, 
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, 
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, 
sunt in culpa qui officia deserunt mollit anim id est laborum.
`

var shakespeare = func() string {
    // The Complete Works of William Shakespeare by William Shakespeare
    // http://www.gutenberg.org/files/100/100-0.txt
    data, err := ioutil.ReadFile(`/home/peter/shakespeare.100-0.txt`)
    if err != nil {
        panic(err)
    }
    return string(data)
}()

func maxSentenceWords(s string) int {
    maxWords, nWords := 0, 0
    inWord := false
    for _, r := range s {
        switch r {
        case '.', '?', '!':
            inWord = false
            if maxWords < nWords {
                maxWords = nWords
            }
            nWords = 0
        default:
            if unicode.IsSpace(r) {
                inWord = false
            } else if inWord == false {
                inWord = true
                nWords++
            }
        }
        if maxWords < nWords {
            maxWords = nWords
        }
    }
    return maxWords
}

func TestPeterSO(t *testing.T) {
    want := 9
    got := maxSentenceWords(sentences)
    if got != want {
        t.Errorf("want %d; got %d", want, got)
    }
}

func BenchmarkPeterSOL(b *testing.B) {
    for N := 0; N < b.N; N++ {
        maxSentenceWords(loremipsum)
    }
}

func BenchmarkPeterSOS(b *testing.B) {
    for N := 0; N < b.N; N++ {
        maxSentenceWords(shakespeare)
    }
}

// MaxWordsInSentences - return max words in one sentences
func MaxWordsInSentences(S string) (result int) {
    r, _ := regexp.Compile("[.||?||!]")
    count := strings.Count(S, ".") + strings.Count(S, "!") + strings.Count(S, "?") // Total sentaces

    for i := 0; i < count; i++ {
        sentence := r.Split(S, count)[i]
        splitSentence := strings.Split(sentence, " ")

        var R []string
        for _, str := range splitSentence {
            if str != "" {
                R = append(R, str)
            }
        }

        if len(R) > result {
            result = len(R)
        }
    }

    return
}

func TestLjubon(t *testing.T) {
    want := 9
    got := MaxWordsInSentences(sentences)
    if got != want {
        t.Errorf("want %d; got %d", want, got)
    }
}

func BenchmarkLjubonL(b *testing.B) {
    for N := 0; N < b.N; N++ {
        MaxWordsInSentences(loremipsum)
    }
}

func BenchmarkLjubonS(b *testing.B) {
    for N := 0; N < b.N; N++ {
        MaxWordsInSentences(shakespeare)
    }
}

func main() {
    s := "One two three four five six seven eight. One two? One two three four five six seven eight nine? One two three! One two three four."
    max := maxSentenceWords(s) // 9
    fmt.Println(max)
    s = "One two three! One two three four"
    max = maxSentenceWords(s) // 4
    fmt.Println(max)
    s = loremipsum
    max = maxSentenceWords(s)
    fmt.Println(max)
}

我称之为乐器的法则,它可以表述如下:给一个小男孩一把锤子,他会发现他遇到的一切都需要冲击。

亚伯拉罕卡普兰,“探究行为:行为科学方法论”,1964年,第28页。


Go regexp包装你的锤子砸任何和所有文本?

另一答案
func MaxWordsInSentences(s string) (result int) {
    var offset, wordCount int
    for i, r := range s {
        switch r {
        default:
            if unicode.IsSpace(r) {
                wordCount++
            }

        case '.', '?', '!':
            if wordCount > result {
                result = wordCount
            }

            wordCount = 1
        }
    }

    return
}

以上是关于从Go中的给定字符串计算句子中的最大单词的主要内容,如果未能解决你的问题,请参考以下文章

使用 C++ 反转句子中的每个单词需要对我的代码片段进行代码优化

给定单词相似度推断句子相似度

2021-10-16:单词拆分 II。给定一个非空字符串 s 和一个包含非空单词列表的字典 wordDict,在字符串中增加空格来构建一个句子,使得句子中所有的单词都在词典中。返回所有这些可能的句子。

算法2114. 句子中的最多单词数(java / c / c++ / python / go / rust)

算法2114. 句子中的最多单词数(java / c / c++ / python / go / rust)

从数据库中替换句子中的单词(Python / Django)