将句子大小写应用于可能包含 HTML 的字符串

Apply Sentence Case to string which might contain HTML

我有一系列字符串需要转换为 "sentence case"(句子大小写),但更复杂的是,字符串中可能含有 HTML 锚标记。

例如,一个字符串可能包含这样的 HTML:

<a href="/foo">foo</a> is a word. this is another word, and <a href="/bar">bar</a> is another.

我想应用句子大小写,输出如下:

<a href="/foo">Foo</a> is a word. This is another word, and <a href="/bar">bar</a> is another.

我可以使用任何基于 JS 或 VBScript 的解决方案。

我想你可以构建一个相当简单的方法,它逐个字符遍历字符串,并根据遇到的字符设置标志(即设置一个 inHtml 标志,表示当前位于 HTML 标记内;再设置一个 shouldCapitalize 标志,表示当前是否处于句子的开头):

/**
 * Applies sentence case to text that may contain HTML markup.
 *
 * The first cased letter of the text, and the first cased letter after each
 * sentence terminator ('.', '?', '!'), is upper-cased. Everything between
 * '<' and the next '>' is copied through untouched, as are HTML character
 * references such as "&nbsp;" or "&#169;". No other character is modified.
 *
 * A terminator directly followed by a letter or digit ("1.5", "example.com")
 * is treated as part of a token, not as the end of a sentence.
 *
 * Known limitations: a literal '>' inside an attribute value ends the tag
 * early, and abbreviations followed by a space ("e.g. foo") still start a
 * new sentence.
 *
 * @param {string} s Text to convert; null/undefined yield an empty string,
 *                   other non-string values are converted with String().
 * @returns {string} The sentence-cased text.
 */
function titleCaseHtmlSentence(s){
    // Null/undefined have no characters to walk; treat them as empty text
    if (s === null || s === undefined) {
        return '';
    }
    s = String(s);

    // Named, decimal and hexadecimal character references
    var entityPattern = /^&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/;

    var result = '';
    var shouldCapitalize = true;
    var inHtml = false;

    for (var i = 0; i < s.length; i++) {
        var ch = s.charAt(i);

        // Inside a tag: copy verbatim until the closing '>'
        if (inHtml) {
            result += ch;
            if (ch === '>') {
                inHtml = false;
            }
            continue;
        }

        if (ch === '<') {
            inHtml = true;
            result += ch;
            continue;
        }

        // Copy a character reference as one unit so its name is never
        // capitalized ("&nbsp;" must not become "&Nbsp;"). It does not
        // consume the pending capitalization.
        if (ch === '&') {
            var entity = entityPattern.exec(s.slice(i, i + 40));
            if (entity) {
                result += entity[0];
                i += entity[0].length - 1;
                continue;
            }
        }

        if (ch === '.' || ch === '?' || ch === '!') {
            result += ch;
            // charAt past the end returns '', which is neither letter nor digit
            var next = s.charAt(i + 1);
            var nextIsWordChar = next.toUpperCase() !== next.toLowerCase() ||
                                 /[0-9]/.test(next);
            if (!nextIsWordChar) {
                shouldCapitalize = true;
            }
            continue;
        }

        // A character is a cased letter when its upper and lower forms differ;
        // this covers accented and non-Latin letters as well as a-z
        var upper = ch.toUpperCase();
        if (shouldCapitalize && upper !== ch.toLowerCase()) {
            // Keep the original when upper-casing would change the length
            // (e.g. the German sharp s expands to "SS")
            result += (upper.length === ch.length) ? upper : ch;
            shouldCapitalize = false;
            continue;
        }

        result += ch;
    }
    return result;
}

我相当仓促地将它组合在一起,所以我确信它在任何意义上都远非最佳,但它应该能按本示例所示的方式工作。

万一有人需要,这里是把 Rion Williams 的逻辑移植成 VBScript 函数的版本。我使用了自己类库中的一些函数,因此只附上其中用到的部分,以供参考。

正如 Rion 所说,这只是一个开始,还需要进行大量微调。

'Applies sentence case to text that may contain HTML markup.
'
'The first cased letter of the text, and the first cased letter after each
'sentence terminator (. ? !), is upper-cased. Everything between "<" and the
'next ">" is copied untouched, as are HTML character references such as
'"&nbsp;" or "&#169;". A terminator directly followed by a letter or digit
'("1.5", "example.com") is not treated as the end of a sentence.
'
'x       : text to convert; Null or Empty yields "".
'returns : the sentence-cased string.
'
'Relies on the adoStream (string builder) and regularExpression classes.
Function toSentenceCase(byVal x)
    Dim i, n, r, s, sChar, sNext, iEnd, bCapitalize, bInHtml, bNextIsWordChar

    'Null/Empty input has no characters to walk
    If IsNull(x) Or IsEmpty(x) Then
        toSentenceCase = ""
        Exit Function
    End If
    x = CStr(x)
    n = Len(x)

    bCapitalize = True
    bInHtml = False

    Set r = New regularExpression
    Set s = New adoStream

    i = 1
    Do While i <= n
        sChar = Mid(x, i, 1)

        'Outside a tag, look for a character reference starting here.
        'The matched reference contains no ";" before its final one, so
        'InStr locates its end. (regularExpression matches case-insensitively.)
        iEnd = 0
        If sChar = "&" And Not bInHtml Then
            If r.test("^&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);", Mid(x, i, 40)) Then
                iEnd = InStr(i, x, ";")
            End If
        End If

        If iEnd > 0 Then
            'copy the reference as one unit so its name is never capitalized
            s.Add Mid(x, i, iEnd - i + 1)
            i = iEnd
        ElseIf bInHtml Then
            'inside a tag: copy verbatim until the closing ">"
            s.Add sChar
            If sChar = ">" Then bInHtml = False
        ElseIf sChar = "<" Then
            s.Add sChar
            bInHtml = True
        ElseIf sChar = "." Or sChar = "?" Or sChar = "!" Then
            s.Add sChar
            'Mid past the end of the string returns "", which is neither
            'a letter nor a digit
            sNext = Mid(x, i + 1, 1)
            bNextIsWordChar = (UCase(sNext) <> LCase(sNext)) Or (sNext >= "0" And sNext <= "9")
            If Not bNextIsWordChar Then bCapitalize = True
        ElseIf bCapitalize And (UCase(sChar) <> LCase(sChar)) Then
            'a character whose upper and lower forms differ is a cased letter
            s.Add UCase(sChar)
            bCapitalize = False
        Else
            s.Add sChar
        End If

        i = i + 1
    Loop

    toSentenceCase = s.Value
End Function

Class adoStream
    'String builder backed by an ADODB.Stream opened in text mode.
    '(Original author's note: streams are much faster than repeatedly
    'appending to a string variable.)
    Private m_buffer

    'Creates and opens the underlying text stream.
    Private Sub Class_Initialize()
        Set m_buffer = CreateObject("ADODB.Stream")
        With m_buffer
            .Type = 2 '2 = text stream
            .Open
        End With
    End Sub

    'Closes and releases the underlying stream.
    Private Sub Class_Terminate()
        m_buffer.Close
        Set m_buffer = Nothing
    End Sub

    'Default member: writes addString to the stream.
    Public Default Sub Add(byVal addString)
        m_buffer.WriteText addString
    End Sub

    'Discards the current content, then writes addString.
    Public Sub Update(byVal addString)
        Clear
        Add addString
    End Sub

    'Returns the whole content, read from the beginning of the stream.
    Public Property Get Value
        With m_buffer
            .Position = 0
            Value = .ReadText()
        End With
    End Property

    'Empties the stream: rewinds to the start and sets end-of-stream there.
    Public Function Clear()
        With m_buffer
            .Position = 0
            .SetEOS
        End With
    End Function
End Class


Class regularExpression
    'Thin wrapper around the VBScript RegExp object, configured for
    'global, case-insensitive matching.
    Private m_engine

    'Creates the RegExp instance and applies the shared options.
    Private Sub Class_Initialize()
        Set m_engine = New RegExp
        With m_engine
            .Global = True    'e.g. findall
            .IgnoreCase = True
        End With
    End Sub

    'Releases the RegExp instance.
    Private Sub Class_Terminate()
        Set m_engine = Nothing
    End Sub

    'Returns True when sPattern matches anywhere in sTestString.
    'A Null sTestString returns False without touching the pattern.
    Public Function test(byVal sPattern, byVal sTestString)
        If IsNull(sTestString) Then
            test = False
        Else
            m_engine.Pattern = sPattern
            test = m_engine.test(sTestString)
        End If
    End Function
End Class