我有一个文本文档表示为
array
每句话我都有一个
单词标记。
我必须为每个标记位置计算文档中标记位置的绝对开始和结束,因此如果在一个句子中
ipsum
五次,每次我都要在那句话中找到正确的位置。
// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
var currentText = [];
textArray.sentences.forEach(function(sentence) {
for (var i = 0; i < sentence.tokens.length; ++i) {
var token = sentence.tokens[i];
var word = token.word;
if (i > 0) {
var thisBegin = token.characterOffsetBegin;
var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
if (thisBegin > previousEnd) {
currentText.push(' ');
}
}
token.characterOffsetBegin = currentText.length;
for (var j = 0; j < word.length; ++j) {
currentText.push(word[j]);
}
token.characterOffsetEnd = currentText.length;
}
currentText.push('\n');
});
return textArray;
} //calculateTokenBeginEnd
但有点不对劲。计算的
characterOffsetBegin
characterOffsetEnd
你错了。
文档具有以下结构
{
"sentences": [
{
"index": 0,
"text": "Lorem ipsum dolor sit amet,",
"tokens": [
{
"index": 1,
"word": "Lorem",
"characterOffsetBegin": 0,
"characterOffsetEnd": 5
},
{
"index": 2,
"word": "ipsum",
"characterOffsetBegin": 5,
"characterOffsetEnd": 10
},
...
]
},
{
"index": 1,
"text": " consectetur adipiscing elit,",
"tokens": [
{
"index": 1,
"word": "",
"characterOffsetBegin": 24,
"characterOffsetEnd": 24
},
...
}
calculateTokenBeginEnd
然后应该计算标记的begin和end索引,而
text2SentencesTokens
创建了上面的文档结构。这个
计算开始时间
text = "Lorem ipsum dolor sit amet,\n consectetur adipiscing elit,\nsed do eiusmod tempor incididunt\nut labore et dolore magna aliqua.\nUt enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi\nut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident,\nLorem ipsum dolor sit amet etwas,\nsunt in culpa qui officia deserunt mollit anim id est laborum"
// to map a text to sentences and tokens
text2SentencesTokens = function(text) {
var self = this;
return new Promise((resolve, _) => {
let sentences = text.split(/\n+/g);
let sentencesP = sentences.map((sentence, lineIndex) => { // for each sentence
return new Promise((resolve, _) => {
let tokens = sentence.split(/\s+/g);
let tokensP = tokens.map((token, tokenIndex) => { // for each token
let item = {
"index": (tokenIndex + 1),
"word": token
}
if (typeof(tokenP) == 'function') {
return tokenP.apply(self, [item]);
} else {
return new Promise((resolve, _) => {
resolve(item);
});
}
});
Promise.all(tokensP)
.then(res => {
resolve({
index: lineIndex,
text: sentence,
tokens: res
});
})
.catch(err => console.error(err))
});
});
Promise.all(sentencesP)
.then(res => {
resolve({
sentences: res
})
})
.catch(err => console.error(err))
});
} //text2SentencesTokens
// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
var currentText = [];
textArray.sentences.forEach(function(sentence) {
for (var i = 0; i < sentence.tokens.length; ++i) {
var token = sentence.tokens[i];
var word = token.word;
if (i > 0) {
var thisBegin = token.characterOffsetBegin;
var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
if (thisBegin > previousEnd) {
currentText.push(' ');
}
}
token.characterOffsetBegin = currentText.length;
for (var j = 0; j < word.length; ++j) {
currentText.push(word[j]);
}
token.characterOffsetEnd = currentText.length;
}
currentText.push('\n');
});
return textArray;
} //calculateTokenBeginEnd
text2SentencesTokens(text)
.then(sentences => {
sentences = calculateTokenBeginEnd(sentences);
console.log(sentences);
})
[更新]
根据建议,我将函数重写如下:
function calculateTokenBeginEnd(textArray) {
var wordStart=-1;
for (var j = 0; j < textArray.sentences.length; ++j) {
var sentence=textArray.sentences[j];
wordStart +=1;
for (var i = 0; i < sentence.tokens.length; ++i) {
var token = sentence.tokens[i];
var word = token.word;
var wordRegex = new RegExp("\\b(" + word + ")\\b", "gi");
var match = wordRegex.exec(sentence.text);
var previousEnd = 0;
wordStart += match.index + previousEnd;
var wordEnd = wordStart + word.length - 1;
token.characterOffsetBegin = wordStart;
token.characterOffsetEnd = wordEnd;
}
}
}//calculateTokenBeginEnd
有更好的解决办法吗?
我已经更新了
根据提出的解决方案。问题是,当存在多个相同的匹配项时,此解决方案将无法正常工作
token
在一个或多个句子中,因为它会用最后一个匹配的位置覆盖开始和结束位置,所以标记
down
这里将获得最后匹配的位置:
{
"index": 2,
"word": "down",
"characterOffsetBegin": 70,
"characterOffsetEnd": 73
}
在第一句话的第一次出现时,它本应该有第一个匹配的位置。
// convert a text document into a sentences array and a token array for each sentence
function text2SentencesTokens(text, tokenP) {
var self = this;
return new Promise((resolve, _) => {
let sentences = text.split(/\n+/g);
let sentencesP = sentences.map((sentence, lineIndex) => { // for each sentence
return new Promise((resolve, _) => {
let tokens = sentence.replace(/[\\+;:\?!\»\«\>\<\]\[\)\(,\.\â'ââ"]/g, '').split(/\s+/g);
let tokensP = tokens.map((token, tokenIndex) => { // for each token
let item = {
"index": (tokenIndex + 1),
"word": token
}
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
var wordRegex = RegExp("\\b(" + escaped + ")\\b", "g");
var match = null;
while ((match = wordRegex.exec(text)) !== null) {
var wordStart = match.index;
var wordEnd = wordStart + token.length - 1;
item.characterOffsetBegin = wordStart;
item.characterOffsetEnd = wordEnd;
}
if (typeof(tokenP) == 'function') {
return tokenP.apply(self, [item, sentence]);
} else {
return new Promise((resolve, _) => {
resolve(item);
});
}
});
Promise.all(tokensP)
.then(res => {
resolve({
index: lineIndex,
text: sentence,
tokens: res
});
})
.catch(err => console.error(err))
});
});
Promise.all(sentencesP)
.then(res => {
resolve({
sentences: res
})
})
.catch(err => console.error(err))
});
} //text2SentencesTokens
text = "Steve down walks warily down the street down\nWith the brim pulled way down low";
text2SentencesTokens(text)
.then(res => console.log(JSON.stringify(res, null, 2)))