如何解析唯一记录及其行索引

How to parse unique records and their row index

我有一组文件,有些没有后缀,有些有不同的后缀。我想隔离文件名而不考虑它们的后缀,并将它们连同它们在同一个电子表格中的行索引范围一起列出。下面是示例和我失败的代码。还附上了电子表格快照。你能帮忙吗?欢迎任何新的 code/logic。

输入:

Row index Filename
1 File1
2 File2_a
3 File2_b
4 File2_c
5 File3_a
6 File3_b

输出:

Filename Row indices range
File1 1 1
File2 2 4
File3 5 6

VBA代码

Sub GetUniqueFiles()

    Dim sh As Worksheet
    Set sh = ThisWorkbook.Sheets("Sheet1")
            
    lastrow = sh1.Range("A" & Rows.Count).End(xlUp).Row
    SameFile = False ' Flag to compare 2 consecutive file names
    i = 3: j = 3
    While i <= (lastrow - 1)
            name_curt = sh.Range("B" & i).Value
            name_next = sh.Range("B" & i + 1).Value
            file_curt = Split(name_curt, "_")(0)
            file_next = Split(name_next, "_")(0)
                        
            If file_curt <> file_next Then
                sh.Range("D" & j).Value = file_curt
                k1 = i
                sh.Range("E" & j).Value = k1
                sh.Range("F" & j).Value = k2
                i = i + 1: j = j + 1
            ElseIf file_curt = file_next Then
                SameFile = True
                sh.Range("B" & j).Value = file_curt
                k1 = i
                While SameFile
                    i = i + 1
                    name_curt = sh.Range("B" & i).Value
                    name_next = sh.Range("B" & i + 1).Value
                    file_curt = Split(name_curt, "_")(0)
                    file_next = Split(name_next, "_")(0)
                Wend
          
            End If
    Wend
    
End Sub

不用VBA也能解决这个问题。在 C 列中添加类似这样的内容:

=LEFT(B2,IFERROR(FIND("_",B2)-1,LEN(B2)+1))

它将删除下划线及其后的所有内容。下一步是计算不同的值。我会选择一个支点 table,但还有很多其他方法。

试试这个:

Sub GetUniqueFiles()

    Dim sh As Worksheet, m, indx, rw As Range, f As String
    Dim r As Long
    
    Set sh = ThisWorkbook.Sheets("Sheet1")
    Set rw = sh.Range("A3:B3")                              'first input row
    
    r = 2 'start row for output
    Do While Application.CountA(rw) = 2                     'loop while have data
        indx = rw.Cells(1).Value
        f = Split(rw.Cells(2).Value, "_")(0)                ' "base" file name
        m = Application.Match(f, sh.Columns("D"), 0)        'see if already listed
        If IsError(m) Then                                  'not already listed ?
            sh.Cells(r, "D").Value = f                      'write file name
            sh.Cells(r, "E").Value = indx                   'write "first seen" index
            m = r
            r = r + 1
        End If
        sh.Cells(m, "F").Value = indx                       'write "last seen" index
        Set rw = rw.Offset(1, 0)                            'next input row
    Loop
End Sub

您可以使用 Power Query 获得所需的输出,在 Windows Excel 2010+ 和 Office 365 Excel

中可用
  • Select 你原来的一些单元格 table
  • Data => Get&Transform => From Table/RangeFrom within sheet
  • 当 PQ UI 打开时,导航至 Home => Advanced Editor
  • 记下代码第 2 行中的 Table 名称。
  • 将现有代码替换为下面的M-Code
  • 将粘贴代码第 2 行中的 table 名称更改为您的“真实”table 名称
  • 检查任何评论,以及 Applied Steps window,以更好地理解算法和步骤

M码

let

//Change table name in next line to actual table name in your workbook
    Source = Excel.CurrentWorkbook(){[Name="Table22"]}[Content],

//split on the underscore and remove the splitted suffix
    #"Split Column by Delimiter" = Table.SplitColumn(Source, "Filename", 
        Splitter.SplitTextByDelimiter("_", QuoteStyle.Csv), {"Filename", "Filename.2"}),

//set data types -- frequently a good idea in PQ
    #"Changed Type1" = Table.TransformColumnTypes(#"Split Column by Delimiter",{
        {"Filename", type text}, {"Filename.2", type text}}),
    #"Removed Columns" = Table.RemoveColumns(#"Changed Type1",{"Filename.2"}),

//Group by file name and extract the lowest and highest rows
    #"Grouped Rows" = Table.Group(#"Removed Columns", {"Filename"}, {
        {"Start Row", each List.Min([Row index]), type number}, 
        {"End Row", each List.Max([Row index]), type number}})
in
    #"Grouped Rows"

为了提高执行速度和节约资源,最好尽量减少与 VBA 中的工作表的交互。例如,无论文件列表有多长,以下内容都精确地引用了两次工作表。不要低估 VBA.

限制工作表交互的价值
Sub GetUniqueFiles()
    Dim c&, i&, a$, b$, vIn, vOut

    Const FILES_IN$ = "b3"
    Const FILES_OUT$ = "d3"
    
    With Range(FILES_IN)
        vIn = .Resize(.End(xlDown).Row - .Row + 1)
    End With
    ReDim vOut(1 To UBound(vIn), 1 To 3)
        
    For i = 1 To UBound(vIn)
        b = Split(vIn(i, 1), "_")(0)
        If a <> b Then
            a = b
            c = c + 1
            vOut(c, 1) = b
            vOut(c, 2) = i
            If c > 1 Then vOut(c - 1, 3) = i - 1
        End If
    Next
    If c > 1 Then vOut(c, 3) = i - 1
    
    Range(FILES_OUT).Resize(UBound(vIn), 3) = vOut
End Sub