确定一个词是否在匹配项的 125 个词以内

Determine if a word is within 125 words of a match

我想遍历一个文档,对于每个单词,看看是否在 250 个单词(后面 125 个,前面 125 个)之内匹配。

如果有匹配项,请突出显示。某些词被排除在外。这些存储在字典中。

为了测试我正在使用的循环,

For Each para In ActiveDocument.Paragraphs
    For Each wrd In para.Range.Words
        Debug.Print wrd & "----" & wrd.Start
    Next wrd
Next para

问题:

“我讨厌去灵界旅行”这句话中的“世界”,在找7的时候打印32。

我想做这样的事情:

If wrd < 125 Then
    Set wrdRng = ActiveDocument.Range(Start:=wrd - 125, End:=ActiveDocument.Words(wrd + 125).End)
Else 
    Set wrdRng = ActiveDocument.Range(Start:=0, End:=ActiveDocument.Words(250 - wrd).End)
End if

编辑:

我目前使用的代码在大约 13 分钟内完成一个 50,000 字文档的循环。那太长了。谁有更好的选择?

Sub Demo()
Application.ScreenUpdating = False
Dim i As Long
Dim StartTime As Double
Dim MinutesElapsed As String


StartTime = Timer

For Each Para In ActiveDocument.Paragraphs
For Each wrd In Para.Range.Words
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Text = "<(McKnight)*>"
    .Replacement.Text = ""
    .Forward = True
    .Wrap = wdFindStop
    .Format = False
    .MatchCase = False
    .MatchWildcards = True
  End With
  Do While .Find.Execute
    If .ComputeStatistics(wdStatisticWords) < 100 Then
      i = i + 1
      .Words.First.HighlightColorIndex = wdBrightGreen
      .Words.Last.HighlightColorIndex = wdBrightGreen
    End If
    .End = .End - Len(.Words.Last)
    .Collapse wdCollapseEnd
  Loop
End With
Application.ScreenUpdating = True
'MsgBox i & " instances found."
Debug.Print wrd
    Next wrd
Next Para
MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")
MsgBox "This code ran successfully in " & MinutesElapsed & " minutes", vbInformation

End Sub

编辑:

Sub Demo()
Application.ScreenUpdating = False
Dim i As Long
Dim StartTime As Double
Dim MinutesElapsed As String


StartTime = Timer

For Each para In ActiveDocument.Paragraphs
For Each wrd In para.Range.Words
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Text = "<(wrd)*>"
    .Replacement.Text = ""
    .Forward = True
    .Wrap = wdFindStop
    .Format = False
    .MatchCase = False
    .MatchWildcards = True
  End With
  Do While .Find.Execute
    If .ComputeStatistics(wdStatisticWords) < 100 Then
      i = i + 1
      .Words.First.HighlightColorIndex = wdBrightGreen
      .Words.Last.HighlightColorIndex = wdBrightGreen
    End If
    .End = .End - Len(.Words.Last)
    .Collapse wdCollapseEnd
  Loop
End With
Application.ScreenUpdating = True
'MsgBox i & " instances found."
'Debug.Print wrd
    Next wrd
Next para
MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")
MsgBox "This code ran successfully in " & MinutesElapsed & " minutes", vbInformation

End Sub

尝试以下操作。除此之外,它还允许您指定要忽略的词(例如介词、冠词等)。此外,不同的突出显示用于识别给定单词上的所有 'hits'。进度报告显示在状态栏上。在我的笔记本电脑上,一份 50,000 字 'lorem' 的文档大约需要 6:40。

Option Explicit
Dim ArrOut() As String

Sub Demo()
' Turn Off Screen Updating
Application.ScreenUpdating = False
Dim eTime As Single
' Start Timing
eTime = Timer
Dim wdDoc As Document, StrFnd As String, StrTmp As String, Rng As Range
Dim SBar As Boolean, bTrk As Boolean, h As Long, i As Long, j As Long
' Store current Status Bar status, then switch on
SBar = Application.DisplayStatusBar
Application.DisplayStatusBar = True
Set wdDoc = ActiveDocument
With wdDoc
  ' Store current Track Changes status, then switch off
  bTrk = .TrackRevisions: .TrackRevisions = False
  'Display status
   Application.StatusBar = "Building word list"
  'Compile the Find list
  Call BuildWordList(.Range.Text)
  With .Range.Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Text = ""
    .Replacement.Text = ""
    .Format = False
    .MatchCase = False
    .MatchWholeWord = True
    .Wrap = wdFindStop
    .Execute
  End With
End With
'Process all words in the concordance
For i = 0 To UBound(ArrOut())
  StrFnd = ArrOut(i)
  h = i Mod 14
  If h < 6 Then
    h = h + 2
  Else
    h = h + 3
  End If
  'Display current word
  Application.StatusBar = "Processing: " & StrFnd
  'Use wildcards, if possible, for extra speed
  If Len(StrFnd) < 4 Then
    StrTmp = ""
    For j = 1 To Len(StrFnd)
      StrTmp = StrTmp & "[" & UCase(Mid(StrFnd, j, 1)) & Mid(StrFnd, j, 1) & "]"
    Next
    StrFnd = StrTmp
    With wdDoc.Range
      With .Find
        .MatchWildcards = True
        .Text = "<(" & StrFnd & ")>*<(" & StrFnd & ")>"
        .Forward = True
        .Wrap = wdFindStop
      End With
      Do While .Find.Execute
        If .ComputeStatistics(wdStatisticWords) < 100 Then
          If .Words.First.HighlightColorIndex <> h Then .Words.First.HighlightColorIndex = h
          .Words.Last.HighlightColorIndex = h
        End If
        .End = .End - Len(.Words.Last)
        .Collapse wdCollapseEnd
      Loop
    End With
  Else
    With wdDoc.Range
      With .Find
        .MatchWildcards = False
        .Text = StrFnd
        .Forward = True
        .Wrap = wdFindStop
        .Execute
      End With
      Set Rng = .Duplicate
      Do While .Find.Execute
        Rng.End = .Duplicate.End
        With Rng
          If .ComputeStatistics(wdStatisticWords) < 100 Then
            If .Words.First.HighlightColorIndex <> h Then .Words.First.HighlightColorIndex = h
            .Words.Last.HighlightColorIndex = h
          End If
        End With
        Set Rng = .Duplicate
        .Collapse wdCollapseEnd
      Loop
    End With
  End If
  DoEvents
Next
' Restore original Track Changes status
wdDoc.TrackRevisions = bTrk
' Clear the Status Bar
Application.StatusBar = False
' Restore original Status Bar status
Application.DisplayStatusBar = SBar
' Restore Screen Updating
Application.ScreenUpdating = True
' Calculate elapsed time
eTime = (Timer - eTime + 86400) Mod 86400 ' Just in case execution time spans midnight
MsgBox "Execution took " & Format(eTime / 86400, "hh:mm:ss") & " to process"
End Sub

Sub BuildWordList(StrIn As String)
Dim StrFnd As String, i As Long, j As Long, k As Long
'Define the exlusions list
Const StrExcl As String = "a,am,and,are,as,at,be,but,by,can,cm,did,do,does,eg," & _
          "en,eq,etc,for,get,go,got,has,have,he,her,him,how,i,ie,if,in,into,is," & _
          "it,its,me,mi,mm,my,na,nb,no,not,of,off,ok,on,one,or,our,out,re,she," & _
          "so,the,their,them,they,t,to,was,we,were,who,will,would,yd,you,your"
 'Strip out unwanted characters
For i = 1 To 255
  Select Case i
    Case 1 To 31, 33 To 64, 91 To 96, 123 To 144, 147 To 191, 247
    Do While InStr(StrIn, Chr(i)) > 0
      StrIn = Replace(StrIn, Chr(i), " ")
    Loop
  End Select
Next
'Convert smart single quotes to plain single quotes & delete any at the start/end of a word
StrIn = Replace(Replace(Replace(Replace(StrIn, Chr(145), "'"), Chr(146), "'"), "' ", " "), " '", " ")
'Convert to lowercase
StrIn = " " & LCase(StrIn) & " "
'Process the exclusions list
For i = 0 To UBound(Split(StrExcl, ","))
  StrFnd = " " & Split(StrExcl, ",")(i) & " "
  Do While InStr(StrIn, StrFnd) > 0
    StrIn = Replace(StrIn, " " & Split(StrExcl, ",")(i) & " ", " ")
  Loop
Next
'Clean up any duplicate spaces
Do While InStr(StrIn, "  ") > 0
  StrIn = Replace(StrIn, "  ", " ")
Loop
i = 0
Do While UBound(Split(StrIn, " ")) > 1
  StrFnd = " " & Split(StrIn, " ")(1) & " ": j = Len(StrIn)
  'Find how many occurences of each word there are in the document
  StrIn = Replace(StrIn, StrFnd, " ")
  k = (j - Len(StrIn)) / (Len(StrFnd) - 1)
  'If there's more than one occurence, add the word to our Find list
  If k > 1 Then
    ReDim Preserve ArrOut(i)
    ArrOut(i) = Trim(StrFnd)
    i = i + 1
  End If
Loop
WordBasic.SortArray ArrOut()
End Sub