在 JavaScript 中计算文档中单词的开始和结束位置

Calculate begin and end position of a word in a document in JavaScript

我有一个文本文档,表示为 array 个句子,对于每个句子,我有一个 array 个单词标记。

我必须为每个标记位置计算文档中标记位置的绝对开始和结束,因此如果在一个句子中我有五次 ipsum,我必须在那个位置得到正确的位置每次出现的句子。

这个函数是我写的

// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
  var currentText = [];
  textArray.sentences.forEach(function(sentence) {
    for (var i = 0; i < sentence.tokens.length; ++i) {
      var token = sentence.tokens[i];
      var word = token.word;
      if (i > 0) {
        var thisBegin = token.characterOffsetBegin;
        var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
        if (thisBegin > previousEnd) {
          currentText.push(' ');
        }
      }
      token.characterOffsetBegin = currentText.length;
      for (var j = 0; j < word.length; ++j) {
        currentText.push(word[j]);
      }
      token.characterOffsetEnd = currentText.length;
    }
    currentText.push('\n');
  });
  return textArray;
} //calculateTokenBeginEnd

但是有点不对劲。计算出来的characterOffsetBegincharacterOffsetEnd是错误的。 文档具有以下结构

{
    "sentences": [
        {
          "index": 0,
          "text": "Lorem ipsum dolor sit amet,",
          "tokens": [
            {
              "index": 1,
              "word": "Lorem",
              "characterOffsetBegin": 0,
              "characterOffsetEnd": 5
            },
            {
              "index": 2,
              "word": "ipsum",
              "characterOffsetBegin": 5,
              "characterOffsetEnd": 10
            },
    ...
          ]
        },
        {
          "index": 1,
          "text": " consectetur adipiscing elit,",
          "tokens": [
            {
              "index": 1,
              "word": "",
              "characterOffsetBegin": 24,
              "characterOffsetEnd": 24
            },
    ...
    }

这是使用此方法的示例。然后 calculateTokenBeginEnd 应该计算令牌开始和结束索引,而 text2SentencesTokens 创建上面的文档结构。 calculateTokenBeginEnd 没有按预期工作。

text = "Lorem ipsum dolor sit amet,\n consectetur adipiscing elit,\nsed do eiusmod tempor incididunt\nut labore et dolore magna aliqua.\nUt enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi\nut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident,\nLorem ipsum dolor sit amet etwas,\nsunt in culpa qui officia deserunt mollit anim id est laborum"

// to map a text to sentences and tokens
text2SentencesTokens = function(text) {
  var self = this;
  return new Promise((resolve, _) => {
    let sentences = text.split(/\n+/g);
    let sentencesP = sentences.map((sentence, lineIndex) => { // for each sentence
      return new Promise((resolve, _) => {
        let tokens = sentence.split(/\s+/g);
        let tokensP = tokens.map((token, tokenIndex) => { // for each token
          let item = {
            "index": (tokenIndex + 1),
            "word": token
          }
          if (typeof(tokenP) == 'function') {
            return tokenP.apply(self, [item]);
          } else {
            return new Promise((resolve, _) => {
              resolve(item);
            });
          }
        });
        Promise.all(tokensP)
          .then(res => {
            resolve({
              index: lineIndex,
              text: sentence,
              tokens: res
            });
          })
          .catch(err => console.error(err))
      });
    });
    Promise.all(sentencesP)
      .then(res => {
        resolve({
          sentences: res
        })
      })
      .catch(err => console.error(err))
  });
} //text2SentencesTokens

// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
  var currentText = [];
  textArray.sentences.forEach(function(sentence) {
    for (var i = 0; i < sentence.tokens.length; ++i) {
      var token = sentence.tokens[i];
      var word = token.word;
      if (i > 0) {
        var thisBegin = token.characterOffsetBegin;
        var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
        if (thisBegin > previousEnd) {
          currentText.push(' ');
        }
      }
      token.characterOffsetBegin = currentText.length;
      for (var j = 0; j < word.length; ++j) {
        currentText.push(word[j]);
      }
      token.characterOffsetEnd = currentText.length;
    }
    currentText.push('\n');
  });
  return textArray;
} //calculateTokenBeginEnd

text2SentencesTokens(text)
  .then(sentences => {
    sentences = calculateTokenBeginEnd(sentences);
    console.log(sentences);

  })

[更新]

根据建议我重写了函数如下:

   function calculateTokenBeginEnd(textArray) {
        var wordStart=-1;
        for (var j = 0; j < textArray.sentences.length; ++j) {
            var sentence=textArray.sentences[j];
            wordStart +=1;
            for (var i = 0; i < sentence.tokens.length; ++i) {
                var token = sentence.tokens[i];
                var word = token.word;
                var wordRegex = new RegExp("\b(" + word + ")\b", "gi");
                var match = wordRegex.exec(sentence.text);
                var previousEnd = 0;
                wordStart += match.index + previousEnd;
                var wordEnd = wordStart + word.length - 1;
                token.characterOffsetBegin = wordStart;
                token.characterOffsetEnd = wordEnd;
            }
        }
    }//calculateTokenBeginEnd

有更好的解决方案吗?

[更新 2] 我已经根据建议的解决方案更新了 text2SentencesTokens。问题是,当一个或多个句子中有多个相同 token 的匹配项时,此解决方案将无法正常工作,因为它会用最后一个匹配的位置覆盖开始和结束位置,因此标记 down 这里会得到最后匹配的位置:

   {
      "index": 2,
      "word": "down",
      "characterOffsetBegin": 70,
      "characterOffsetEnd": 73
    }

在第一个句子的第一次出现,而它应该有第一个匹配的位置。

// convert a text document into a sentences array and a token array for each sentence
function text2SentencesTokens(text, tokenP) {
  var self = this;
  return new Promise((resolve, _) => {
    let sentences = text.split(/\n+/g);
    let sentencesP = sentences.map((sentence, lineIndex) => { // for each sentence
      return new Promise((resolve, _) => {
        let tokens = sentence.replace(/[\+;:\?!\»\«\>\<\]\[\)\(,\.\‘'“”"]/g, '').split(/\s+/g);
        let tokensP = tokens.map((token, tokenIndex) => { // for each token
          let item = {
            "index": (tokenIndex + 1),
            "word": token
          }
          var escaped = token.replace(/[\-\[\]{}()*+?.,\\^$|#\s]/g, "\$&");
          var wordRegex = RegExp("\b(" + escaped + ")\b", "g");
          var match = null;
          while ((match = wordRegex.exec(text)) !== null) {
            var wordStart = match.index;
            var wordEnd = wordStart + token.length - 1;
            item.characterOffsetBegin = wordStart;
            item.characterOffsetEnd = wordEnd;
          }

          if (typeof(tokenP) == 'function') {
            return tokenP.apply(self, [item, sentence]);
          } else {
            return new Promise((resolve, _) => {
              resolve(item);
            });
          }
        });
        Promise.all(tokensP)
          .then(res => {
            resolve({
              index: lineIndex,
              text: sentence,
              tokens: res
            });
          })
          .catch(err => console.error(err))
      });
    });
    Promise.all(sentencesP)
      .then(res => {
        resolve({
          sentences: res
        })
      })
      .catch(err => console.error(err))
  });
} //text2SentencesTokens

text = "Steve down walks warily down the street down\nWith the brim pulled way down low";
text2SentencesTokens(text)
  .then(res => console.log(JSON.stringify(res, null, 2)))

这可能是计算句子中单词 start/end 的一种更简单的方法,希望对您有所帮助

var word = "Lorem";
var reg = RegExp(word, 'g');
var sentence = "Lore ipsum Lorem dolor sit Lorem amet,";
var match;

console.log(sentence);
console.log(word);

while ((match = reg.exec(sentence)) !== null) {
  var wordStart = match.index;
  var wordEnd = wordStart + word.length - 1;
  console.log(wordStart + ' -start index');
  console.log(word.length + ' -length of word');
  console.log(wordEnd + ' -last character index, need to +1 to use with substring');
  console.log(sentence.substring(wordStart, wordEnd + 1) + '-using substring with calculated to find the word and verify');
}