要索引的 Azure Blob 存储 blob
Azure BlobStorage blobs to Index
是否可以将文档上传到 blob 存储并执行以下操作:
- 获取文档内容并添加到索引。
- 从第 1 点的内容中抓取关键短语并添加到索引中。
我希望关键短语可以搜索到。
我有可以将文档上传到 blob 存储的代码,它工作得很好,但是(据我所知)为这些文档建立索引的唯一方法是使用 Azure 搜索服务中的 "Import Data",它会创建一个带有预定义字段的索引 - 如下所示:
这在只需要这些字段并且索引每 5 分钟自动更新一次时效果很好。但是当我想要一个自定义索引时就成了一个问题
但是,我唯一想要的字段如下:
- fileId(文件编号)
- fileText(这是文档的内容)
- blobURL(允许下载文档)
- keyPhrases(要从 fileText 中提取 - 我也有执行此操作的代码)
我遇到的唯一问题是:我需要先检索到文档内容 (fileText) 才能提取关键短语,但据我所知,只有当文档内容已经存在于索引中、可以供我访问时,我才能做到这一点?
我对 Azure 的了解非常有限,很难找到与我想做的类似的事情。
我用来将文档上传到我的 blob 存储的代码如下:
/// <summary>
/// Saves the posted file to a temporary location under ~/App_Data/Uploads, uploads it to the
/// configured blob container as "{userId}_{timestamp}_{fileName}", and removes the temporary file.
/// </summary>
/// <param name="file">The file posted by the client.</param>
/// <returns>A reference to the uploaded blob, or <c>null</c> if anything went wrong.</returns>
public CloudBlockBlob UploadBlob(HttpPostedFileBase file)
{
    // NOTE(review): the storage credentials below are built from the "SearchServiceName"
    // setting. Confirm that this setting really holds the storage account name.
    string searchServiceName = ConfigurationManager.AppSettings["SearchServiceName"];
    string blobStorageKey = ConfigurationManager.AppSettings["BlobStorageKey"];
    string blobStorageName = ConfigurationManager.AppSettings["BlobStorageName"];
    string blobStorageURL = ConfigurationManager.AppSettings["BlobStorageURL"];
    string userId = User.Identity.GetUserId();

    // "HH" is the 24-hour clock. The previous "hh" (12-hour) produced the same stamp for
    // e.g. 03:15 and 15:15, so two uploads could collide on one blob name. The invariant
    // culture keeps the stamp independent of the server's regional calendar.
    string uploadDateTime = DateTime.Now.ToString(
        "yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

    string tempPath = null;
    try
    {
        // Keep only the file-name part: the client-supplied name may carry directory
        // segments, which must not leak into the local path or the blob name.
        string blobName = userId + "_" + uploadDateTime + "_" + Path.GetFileName(file.FileName);

        tempPath = Path.Combine(Server.MapPath("~/App_Data/Uploads"), blobName);
        file.SaveAs(tempPath);

        var credentials = new StorageCredentials(searchServiceName, blobStorageKey);
        var client = new CloudBlobClient(new Uri(blobStorageURL), credentials);

        // The container must already exist (create it in the management portal,
        // or call container.CreateIfNotExists()).
        var container = client.GetContainerReference(blobStorageName);
        var blockBlob = container.GetBlockBlobReference(blobName);

        // Create or overwrite the blob with the contents of the temporary file.
        using (var fileStream = System.IO.File.OpenRead(tempPath))
        {
            blockBlob.UploadFromStream(fileStream);
        }

        return blockBlob;
    }
    catch (Exception e)
    {
        // Callers rely on a null result to detect failure; record the cause instead of
        // silently discarding it.
        System.Diagnostics.Trace.TraceError("UploadBlob failed: {0}", e);
        return null;
    }
    finally
    {
        // Remove the temporary copy on every path, including a failed upload.
        if (tempPath != null)
        {
            try
            {
                System.IO.File.Delete(tempPath);
            }
            catch (Exception cleanupError)
            {
                // A cleanup failure must not mask the upload result.
                System.Diagnostics.Trace.TraceWarning(
                    "UploadBlob could not delete temporary file '{0}': {1}", tempPath, cleanupError);
            }
        }
    }
}
我希望我没有提供太多信息,但我不知道还有什么可以解释我正在寻找的东西。如果我说的不对,请告诉我,以便我解决问题。
我不是在寻找现成的代码,只是希望有人能给我指出正确的方向。
如有任何帮助,我将不胜感激。
谢谢!
我们可以使用 Azure 搜索的 REST API 或 .NET SDK 对文档进行索引。
根据您的描述,我用.NET SDK创建了一个demo并测试成功。以下是我的详细步骤:
- 从 Azure 门户创建 Azure 搜索
- 从 Azure 门户获取搜索服务的管理 API 密钥(admin key)
创建自定义索引字段模型
/// <summary>
/// Document model for the custom search index: a key, the file text, the blob URL,
/// and the key phrases.
/// </summary>
[SerializePropertyNamesAsCamelCase]
public class TomTestModel
{
    /// <summary>Document key; also marked filterable.</summary>
    [Key, IsFilterable]
    public string fileId { get; set; }

    /// <summary>Text content of the file; marked searchable.</summary>
    [IsSearchable]
    public string fileText { get; set; }

    /// <summary>URL of the blob; carries no search attributes.</summary>
    public string blobURL { get; set; }

    /// <summary>Key phrases for the file; marked searchable.</summary>
    [IsSearchable]
    public string keyPhrases { get; set; }
}
- 创建数据源
// Connect to the search service using the service name and admin key from configuration.
string searchServiceName = ConfigurationManager.AppSettings["SearchServiceName"];
string adminApiKey = ConfigurationManager.AppSettings["SearchServiceAdminApiKey"];
SearchServiceClient serviceClient = new SearchServiceClient(searchServiceName, new SearchCredentials(adminApiKey));

// NOTE(review): the three string arguments are placeholders; replace them with real values.
var dataSource = DataSource.AzureBlobStorage("storage name", "connectstrong", "container name");

// Create the data source, or overwrite an existing one with the same name, in a single
// call. This replaces the previous Exists/Delete/Create sequence, which needed up to three
// round trips and left a window in which the data source did not exist.
serviceClient.DataSources.CreateOrUpdate(dataSource);
- 创建自定义索引
// Describe the custom index; its field list is built from the TomTestModel type.
var definition = new Index
{
    Name = "tomcustomindex",
    Fields = FieldBuilder.BuildForType<TomTestModel>()
};

// Remove any index that already uses this name, then create it from the definition.
bool indexAlreadyExists = serviceClient.Indexes.Exists(definition.Name);
if (indexAlreadyExists)
{
    serviceClient.Indexes.Delete(definition.Name);
}

var index = serviceClient.Indexes.Create(definition);
- 上传文档到索引。关于使用 SDK 操作存储的更多信息,请参考官方文档。
// List the blobs in the container. The connection string and container name are placeholders.
CloudStorageAccount storageAccount = CloudStorageAccount.Parse("connection string");
var blobClient = storageAccount.CreateCloudBlobClient();
var container = blobClient.GetContainerReference("container name");
var blobList = container.ListBlobs();

// Build one index document per listed item. fileText and keyPhrases are placeholder
// values here; substitute the real document text and the extracted key phrases.
var tomIndexsList = blobList.Select(blob => new TomTestModel
{
    fileId = Guid.NewGuid().ToString(),
    blobURL = blob.Uri.ToString(),
    fileText = "Blob Content",
    keyPhrases = "key phrases",
}).ToList();

var batch = IndexBatch.Upload(tomIndexsList);

// Target the index that was created under the name "tomcustomindex"; the previous
// name "index" did not match any index created by these steps.
ISearchIndexClient indexClient = serviceClient.Indexes.GetClient("tomcustomindex");

// There is nothing to send when the container listing was empty.
if (tomIndexsList.Count > 0)
{
    indexClient.Documents.Index(batch);
}
- 在搜索资源管理器(Search explorer)中查看搜索结果。
packages.config 文件:
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Azure.KeyVault.Core" version="1.0.0" targetFramework="net452" />
<package id="Microsoft.Azure.Search" version="3.0.0-rc" targetFramework="net452" />
<package id="Microsoft.Data.Edm" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Data.OData" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Data.Services.Client" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Rest.ClientRuntime" version="2.3.4" targetFramework="net452" />
<package id="Microsoft.Rest.ClientRuntime.Azure" version="3.3.4" targetFramework="net452" />
<package id="Microsoft.Spatial" version="6.15.0" targetFramework="net452" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net452" />
<package id="System.Spatial" version="5.6.4" targetFramework="net452" />
<package id="WindowsAzure.Storage" version="7.2.1" targetFramework="net452" />
</packages>
TomTestModel 文件:
using System.ComponentModel.DataAnnotations;
using Microsoft.Azure.Search;
using Microsoft.Azure.Search.Models;
namespace TomAzureSearchTest
{
    /// <summary>
    /// Document model for the custom search index: a key, the file text, the blob URL,
    /// and the key phrases.
    /// </summary>
    [SerializePropertyNamesAsCamelCase]
    public class TomTestModel
    {
        /// <summary>Document key; also marked filterable.</summary>
        [Key, IsFilterable]
        public string fileId { get; set; }

        /// <summary>Text content of the file; marked searchable.</summary>
        [IsSearchable]
        public string fileText { get; set; }

        /// <summary>URL of the blob; carries no search attributes.</summary>
        public string blobURL { get; set; }

        /// <summary>Key phrases for the file; marked searchable.</summary>
        [IsSearchable]
        public string keyPhrases { get; set; }
    }
}
是否可以将文档上传到 blob 存储并执行以下操作:
- 获取文档内容并添加到索引。
- 从第 1 点的内容中抓取关键短语并添加到索引中。
我希望关键短语可以搜索到。
我有可以将文档上传到 blob 存储的代码,它工作得很好,但是(据我所知)为这些文档建立索引的唯一方法是使用 Azure 搜索服务中的 "Import Data",它会创建一个带有预定义字段的索引 - 如下所示:
这在只需要这些字段并且索引每 5 分钟自动更新一次时效果很好。但是当我想要一个自定义索引时就成了一个问题
但是,我唯一想要的字段如下:
- fileId(文件编号)
- fileText(这是文档的内容)
- blobURL(允许下载文档)
- keyPhrases(要从 fileText 中提取 - 我也有执行此操作的代码)
我遇到的唯一问题是:我需要先检索到文档内容 (fileText) 才能提取关键短语,但据我所知,只有当文档内容已经存在于索引中、可以供我访问时,我才能做到这一点?
我对 Azure 的了解非常有限,很难找到与我想做的类似的事情。
我用来将文档上传到我的 blob 存储的代码如下:
/// <summary>
/// Saves the posted file to a temporary location under ~/App_Data/Uploads, uploads it to the
/// configured blob container as "{userId}_{timestamp}_{fileName}", and removes the temporary file.
/// </summary>
/// <param name="file">The file posted by the client.</param>
/// <returns>A reference to the uploaded blob, or <c>null</c> if anything went wrong.</returns>
public CloudBlockBlob UploadBlob(HttpPostedFileBase file)
{
    // NOTE(review): the storage credentials below are built from the "SearchServiceName"
    // setting. Confirm that this setting really holds the storage account name.
    string searchServiceName = ConfigurationManager.AppSettings["SearchServiceName"];
    string blobStorageKey = ConfigurationManager.AppSettings["BlobStorageKey"];
    string blobStorageName = ConfigurationManager.AppSettings["BlobStorageName"];
    string blobStorageURL = ConfigurationManager.AppSettings["BlobStorageURL"];
    string userId = User.Identity.GetUserId();

    // "HH" is the 24-hour clock. The previous "hh" (12-hour) produced the same stamp for
    // e.g. 03:15 and 15:15, so two uploads could collide on one blob name. The invariant
    // culture keeps the stamp independent of the server's regional calendar.
    string uploadDateTime = DateTime.Now.ToString(
        "yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

    string tempPath = null;
    try
    {
        // Keep only the file-name part: the client-supplied name may carry directory
        // segments, which must not leak into the local path or the blob name.
        string blobName = userId + "_" + uploadDateTime + "_" + Path.GetFileName(file.FileName);

        tempPath = Path.Combine(Server.MapPath("~/App_Data/Uploads"), blobName);
        file.SaveAs(tempPath);

        var credentials = new StorageCredentials(searchServiceName, blobStorageKey);
        var client = new CloudBlobClient(new Uri(blobStorageURL), credentials);

        // The container must already exist (create it in the management portal,
        // or call container.CreateIfNotExists()).
        var container = client.GetContainerReference(blobStorageName);
        var blockBlob = container.GetBlockBlobReference(blobName);

        // Create or overwrite the blob with the contents of the temporary file.
        using (var fileStream = System.IO.File.OpenRead(tempPath))
        {
            blockBlob.UploadFromStream(fileStream);
        }

        return blockBlob;
    }
    catch (Exception e)
    {
        // Callers rely on a null result to detect failure; record the cause instead of
        // silently discarding it.
        System.Diagnostics.Trace.TraceError("UploadBlob failed: {0}", e);
        return null;
    }
    finally
    {
        // Remove the temporary copy on every path, including a failed upload.
        if (tempPath != null)
        {
            try
            {
                System.IO.File.Delete(tempPath);
            }
            catch (Exception cleanupError)
            {
                // A cleanup failure must not mask the upload result.
                System.Diagnostics.Trace.TraceWarning(
                    "UploadBlob could not delete temporary file '{0}': {1}", tempPath, cleanupError);
            }
        }
    }
}
我希望我没有提供太多信息,但我不知道还有什么可以解释我正在寻找的东西。如果我说的不对,请告诉我,以便我解决问题。
我不是在寻找现成的代码,只是希望有人能给我指出正确的方向。
如有任何帮助,我将不胜感激。
谢谢!
我们可以使用 Azure 搜索的 REST API 或 .NET SDK 对文档进行索引。 根据您的描述,我用 .NET SDK 创建了一个 demo 并测试成功。以下是我的详细步骤:
- 从 Azure 门户创建 Azure 搜索
- 从 Azure 门户获取搜索服务的管理 API 密钥(admin key)
创建自定义索引字段模型
/// <summary>
/// Document model for the custom search index: a key, the file text, the blob URL,
/// and the key phrases.
/// </summary>
[SerializePropertyNamesAsCamelCase]
public class TomTestModel
{
    /// <summary>Document key; also marked filterable.</summary>
    [Key, IsFilterable]
    public string fileId { get; set; }

    /// <summary>Text content of the file; marked searchable.</summary>
    [IsSearchable]
    public string fileText { get; set; }

    /// <summary>URL of the blob; carries no search attributes.</summary>
    public string blobURL { get; set; }

    /// <summary>Key phrases for the file; marked searchable.</summary>
    [IsSearchable]
    public string keyPhrases { get; set; }
}
- 创建数据源
// Connect to the search service using the service name and admin key from configuration.
string searchServiceName = ConfigurationManager.AppSettings["SearchServiceName"];
string adminApiKey = ConfigurationManager.AppSettings["SearchServiceAdminApiKey"];
SearchServiceClient serviceClient = new SearchServiceClient(searchServiceName, new SearchCredentials(adminApiKey));

// NOTE(review): the three string arguments are placeholders; replace them with real values.
var dataSource = DataSource.AzureBlobStorage("storage name", "connectstrong", "container name");

// Create the data source, or overwrite an existing one with the same name, in a single
// call. This replaces the previous Exists/Delete/Create sequence, which needed up to three
// round trips and left a window in which the data source did not exist.
serviceClient.DataSources.CreateOrUpdate(dataSource);
- 创建自定义索引
// Describe the custom index; its field list is built from the TomTestModel type.
var definition = new Index
{
    Name = "tomcustomindex",
    Fields = FieldBuilder.BuildForType<TomTestModel>()
};

// Remove any index that already uses this name, then create it from the definition.
bool indexAlreadyExists = serviceClient.Indexes.Exists(definition.Name);
if (indexAlreadyExists)
{
    serviceClient.Indexes.Delete(definition.Name);
}

var index = serviceClient.Indexes.Create(definition);
- 上传文档到索引。关于使用 SDK 操作存储的更多信息,请参考官方文档。
// List the blobs in the container. The connection string and container name are placeholders.
CloudStorageAccount storageAccount = CloudStorageAccount.Parse("connection string");
var blobClient = storageAccount.CreateCloudBlobClient();
var container = blobClient.GetContainerReference("container name");
var blobList = container.ListBlobs();

// Build one index document per listed item. fileText and keyPhrases are placeholder
// values here; substitute the real document text and the extracted key phrases.
var tomIndexsList = blobList.Select(blob => new TomTestModel
{
    fileId = Guid.NewGuid().ToString(),
    blobURL = blob.Uri.ToString(),
    fileText = "Blob Content",
    keyPhrases = "key phrases",
}).ToList();

var batch = IndexBatch.Upload(tomIndexsList);

// Target the index that was created under the name "tomcustomindex"; the previous
// name "index" did not match any index created by these steps.
ISearchIndexClient indexClient = serviceClient.Indexes.GetClient("tomcustomindex");

// There is nothing to send when the container listing was empty.
if (tomIndexsList.Count > 0)
{
    indexClient.Documents.Index(batch);
}
- 在搜索资源管理器(Search explorer)中查看搜索结果。
packages.config 文件:
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Azure.KeyVault.Core" version="1.0.0" targetFramework="net452" />
<package id="Microsoft.Azure.Search" version="3.0.0-rc" targetFramework="net452" />
<package id="Microsoft.Data.Edm" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Data.OData" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Data.Services.Client" version="5.6.4" targetFramework="net452" />
<package id="Microsoft.Rest.ClientRuntime" version="2.3.4" targetFramework="net452" />
<package id="Microsoft.Rest.ClientRuntime.Azure" version="3.3.4" targetFramework="net452" />
<package id="Microsoft.Spatial" version="6.15.0" targetFramework="net452" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net452" />
<package id="System.Spatial" version="5.6.4" targetFramework="net452" />
<package id="WindowsAzure.Storage" version="7.2.1" targetFramework="net452" />
</packages>
TomTestModel 文件:
using System.ComponentModel.DataAnnotations;
using Microsoft.Azure.Search;
using Microsoft.Azure.Search.Models;
namespace TomAzureSearchTest
{
    /// <summary>
    /// Document model for the custom search index: a key, the file text, the blob URL,
    /// and the key phrases.
    /// </summary>
    [SerializePropertyNamesAsCamelCase]
    public class TomTestModel
    {
        /// <summary>Document key; also marked filterable.</summary>
        [Key, IsFilterable]
        public string fileId { get; set; }

        /// <summary>Text content of the file; marked searchable.</summary>
        [IsSearchable]
        public string fileText { get; set; }

        /// <summary>URL of the blob; carries no search attributes.</summary>
        public string blobURL { get; set; }

        /// <summary>Key phrases for the file; marked searchable.</summary>
        [IsSearchable]
        public string keyPhrases { get; set; }
    }
}