Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 101 additions & 72 deletions src/main/java/org/grobid/core/engines/SoftwareParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import nu.xom.Attribute;
import nu.xom.Element;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -215,8 +216,7 @@ public List<SoftwareEntity> processText(String text, boolean disambiguate) throw
List<LayoutToken> tokens = SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(text);
if (CollectionUtils.isEmpty(tokens)) {
return null;
} else if (tokens.size() > 512 && softwareConfiguration.getModel("software").engine.equals("delft")) {
// } else if (tokens.size() > 512) {
} else if (needToSplitInSentences(tokens)) {
String tempText = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);
List<List<LayoutToken>> outputTokens = offsetPositions.stream()
Expand Down Expand Up @@ -346,6 +346,16 @@ public List<SoftwareEntity> processText(String text, boolean disambiguate) throw
return entities;
}

/**
 * Decides whether a token sequence must be segmented into sentences before
 * sequence labelling: only sequences longer than 512 tokens need splitting,
 * and only when a Deep Learning engine (limited input window, e.g. BERT)
 * is used for labelling.
 *
 * @param tokens the layout tokens of the candidate sequence
 * @return true when the sequence exceeds 512 tokens and a DL engine is in use
 */
private boolean needToSplitInSentences(List<LayoutToken> tokens) {
    // evaluate the cheap size check first; the engine lookup is only
    // performed when the sequence is actually over the limit
    final boolean exceedsModelWindow = tokens.size() > 512;
    return exceedsModelWindow && isSequenceLabellingUsingDL();
}

/**
 * Returns true when sequence labelling relies on a Deep Learning backend,
 * i.e. when either the "software" or the "software-type" model is configured
 * with the "delft" engine. Such engines have a bounded input window, so
 * callers use this to decide whether long sequences must be split.
 *
 * @return true if at least one of the two labelling models uses "delft"
 */
private boolean isSequenceLabellingUsingDL() {
    // null-safe comparison: avoids a NullPointerException if the engine
    // field is not set in the configuration (removed leftover debug
    // "return true;" comment as well)
    return "delft".equals(softwareConfiguration.getModel("software").engine)
        || "delft".equals(softwareConfiguration.getModel("software-type").engine);
}

/**
* Extract all Software mentions from a pdf file
*/
Expand Down Expand Up @@ -825,14 +835,14 @@ public Pair<List<SoftwareEntity>, Document> processPDF(File file,
*/
private List<SoftwareEntity> processLayoutTokenSequence(
List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
) {
List<LayoutTokenization> layoutTokenizations = new ArrayList<LayoutTokenization>();
List<LayoutTokenization> layoutTokenizations = new ArrayList<>();
layoutTokenizations.add(new LayoutTokenization(layoutTokens));
return processLayoutTokenSequences(layoutTokenizations, entities, disambiguate, addParagraphContext, fromPDF, fromXML, pdfAnnotations);
}
Expand All @@ -842,11 +852,12 @@ private List<SoftwareEntity> processLayoutTokenSequence(
*/
private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
List<List<LayoutToken>> layoutTokenList,
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML) {
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML
) {

return processLayoutTokenSequenceMultiple(
layoutTokenList,
Expand All @@ -868,7 +879,7 @@ private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
) {
List<LayoutTokenization> layoutTokenizations = new ArrayList<LayoutTokenization>();
List<LayoutTokenization> layoutTokenizations = new ArrayList<>();
for (List<LayoutToken> layoutTokens : layoutTokenList)
layoutTokenizations.add(new LayoutTokenization(layoutTokens));
return processLayoutTokenSequences(layoutTokenizations, entities, disambiguate, addParagraphContext, fromPDF, fromXML, pdfAnnotations);
Expand All @@ -891,7 +902,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(
List<LayoutToken> layoutTokens = layoutTokenization.getTokenization();
layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens);

if ((layoutTokens == null) || (layoutTokens.size() == 0))
if (CollectionUtils.isEmpty(layoutTokens))
continue;

// positions for lexical match
Expand All @@ -910,7 +921,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(

// labeled result from sequence labelling lib
String allResString = allRess.toString();
if (allResString.trim().length() == 0) {
if (StringUtils.isBlank(allResString)) {
// empty content, nothing more to do
return entities;
}
Expand All @@ -922,7 +933,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(
List<LayoutToken> layoutTokens = layoutTokenization.getTokenization();
layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens);

if ((layoutTokens == null) || (layoutTokens.size() == 0))
if (CollectionUtils.isEmpty(layoutTokens))
continue;

// text of the selected segment
Expand All @@ -933,8 +944,9 @@ private List<SoftwareEntity> processLayoutTokenSequences(

String localRes = resBlocks[l];

if (localRes == null || localRes.length() == 0)
if (StringUtils.isBlank(localRes)) {
continue;
}

List<SoftwareComponent> components = extractSoftwareComponents(text, localRes, layoutTokens);
l++;
Expand Down Expand Up @@ -968,10 +980,10 @@ private List<SoftwareEntity> processLayoutTokenSequences(
//text = LayoutTokensUtil.normalizeDehyphenizeText(layoutTokens);

// finally refine software types, if there is anything to refine
if (localEntities.size() > 0) {
if (CollectionUtils.isNotEmpty(localEntities)) {
try {
List<SoftwareType> entityTypes = softwareTypeParser.processFeatureInput(text, localRes, layoutTokens);
if (entityTypes != null && entityTypes.size() > 0) {
if (CollectionUtils.isNotEmpty(entityTypes)) {
localEntities = refineTypes(localEntities, entityTypes);
Collections.sort(localEntities);
}
Expand Down Expand Up @@ -1066,15 +1078,17 @@ private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(
return entities;
}

public List<SoftwareEntity> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
Map<String, Double> termProfiles,
FastMatcher termPattern,
List<OffsetPosition> placeTaken,
Map<String, Integer> frequencies,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML) {
public List<SoftwareEntity> propagateLayoutTokenSequence(
List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
Map<String, Double> termProfiles,
FastMatcher termPattern,
List<OffsetPosition> placeTaken,
Map<String, Integer> frequencies,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML
) {
// possible offset of the sequence in the complete document tokenization
int offsetShift = 0;
if (layoutTokens != null && layoutTokens.size() > 0 && layoutTokens.get(0).getOffset() != 0) {
Expand Down Expand Up @@ -1813,23 +1827,25 @@ public String addFeatures(List<LayoutToken> tokens,
/**
* Add to the entities their local text contexts: the sentence where the software name occurs.
*/
public List<SoftwareEntity> addContext(List<SoftwareEntity> entities,
String text,
List<LayoutToken> tokens,
boolean fromPDF,
boolean fromXML,
boolean addParagraphContext) {
public List<SoftwareEntity> addContext(
List<SoftwareEntity> entities,
String text,
List<LayoutToken> tokens,
boolean fromPDF,
boolean fromXML,
boolean addParagraphContext
) {
// adjust offsets if tokenization does not start at 0
int offsetShift = 0;
if (tokens != null && tokens.size() > 0 && tokens.get(0).getOffset() != 0) {
if (CollectionUtils.isNotEmpty(tokens) && tokens.get(0).getOffset() != 0) {
offsetShift = tokens.get(0).getOffset();
}

//System.out.println("offsetShift: " + offsetShift);

// we start by segmenting the tokenized text into sentences

List<OffsetPosition> forbidden = new ArrayList<OffsetPosition>();
List<OffsetPosition> forbidden = new ArrayList<>();
// fill the position where entities occur to avoid segmenting in the middle of a software string, same for
// the reference marker positions too
for (SoftwareEntity entity : entities) {
Expand Down Expand Up @@ -2454,9 +2470,11 @@ public int boostrapTrainingPDF(String inputDirectory,
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(File file,
boolean disambiguate,
boolean addParagraphContext) throws IOException {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(
File file,
boolean disambiguate,
boolean addParagraphContext
) throws IOException {
Pair<List<SoftwareEntity>, List<BibDataSet>> resultExtraction = null;
try {
String tei = processXML(file);
Expand Down Expand Up @@ -2484,9 +2502,11 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(File file,
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEI(File file,
boolean disambiguate,
boolean addParagraphContext) throws IOException {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEI(
File file,
boolean disambiguate,
boolean addParagraphContext
) throws IOException {
Pair<List<SoftwareEntity>, List<BibDataSet>> resultExtraction = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Expand Down Expand Up @@ -2546,9 +2566,11 @@ public String processXML(File file) throws Exception {
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext) {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(
org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext
) {
List<SoftwareEntity> entities = new ArrayList<>();

List<List<LayoutToken>> selectedLayoutTokenSequencesRaw = new ArrayList<>();
Expand Down Expand Up @@ -2577,9 +2599,9 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
if (StringUtils.isNotBlank(contentText)) {
List<LayoutToken> paragraphTokens =
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(contentText);
String orginalText = UnicodeUtil.normaliseText(paragraphElement.getTextContent());
String originalText = UnicodeUtil.normaliseText(paragraphElement.getTextContent());
List<LayoutToken> originalParagraphTokens =
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(orginalText);
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(originalText);
if (CollectionUtils.isNotEmpty(paragraphTokens)) {
// shift the paragraph tokens to the global position
for (LayoutToken paragraphToken : paragraphTokens) {
Expand All @@ -2600,28 +2622,35 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
}
}

logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequencesRaw.stream().filter(s -> s.size() > 512).count());
List<List<LayoutToken>> selectedLayoutTokenSequences = new ArrayList<>(selectedLayoutTokenSequencesRaw);

// Run the sentence segmenter for sequences that are larger than 512 tokens, what the BERT model can handle.
List<List<LayoutToken>> selectedLayoutTokenSequences = selectedLayoutTokenSequencesRaw.stream()
.flatMap(tokens -> {
if (tokens.size() > 512) {
String tempText = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);
List<List<LayoutToken>> splitTokens =offsetPositions.stream()
.map(op -> tempText.substring(op.start, op.end))
.map(s -> SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(s))
.collect(Collectors.toList());
return splitTokens.stream();
}
return Stream.of(tokens);
}
).collect(Collectors.toList());
if (isSequenceLabellingUsingDL()) {
logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequencesRaw.stream().filter(s -> s.size() > 512).count());

// Run the sentence segmenter for sequences that are larger than 512 tokens, what the BERT model can handle.
List<List<LayoutToken>> tempSequences = new ArrayList<>();
for (List<LayoutToken> tokens : selectedLayoutTokenSequencesRaw) {
if (tokens.size() > 512) {
// Split large sequences into sentences
String tempText = LayoutTokensUtil.toText(tokens);
int offset = tokens.get(0).getOffset();
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);

logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequences.stream().filter(s -> s.size() > 512).count());
for (OffsetPosition op : offsetPositions) {
String sentenceText = tempText.substring(op.start, op.end);
List<LayoutToken> sentenceTokens = SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(sentenceText);
sentenceTokens.stream().forEach(token -> token.setOffset(token.getOffset() + offset + op.start));

tempSequences.add(sentenceTokens);
}
} else {
// Keep small sequences as-is
tempSequences.add(tokens);
}
}
selectedLayoutTokenSequences = tempSequences;
}

processLayoutTokenSequenceMultiple(selectedLayoutTokenSequences, entities, disambiguate, addParagraphContext, false, true);
selectedLayoutTokenSequences = selectedOriginalLayoutTokenSequences;
Expand All @@ -2633,7 +2662,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
String context = entity.getContext();
if (context == null)
continue;
String oldContext = new String(entity.getContext());
String oldContext = entity.getContext();

int paragraphContextOffset = entity.getParagraphContextOffset();
int globalContextOffset = entity.getGlobalContextOffset();
Expand All @@ -2645,7 +2674,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
for (int i = sequenceIndex; i < selectedLayoutTokenSequences.size(); i++) {
int posStartSequence = -1;
List<LayoutToken> selectedLayoutTokenSequence = selectedLayoutTokenSequences.get(i);
if (selectedLayoutTokenSequence != null && selectedLayoutTokenSequence.size() > 0) {
if (CollectionUtils.isNotEmpty(selectedLayoutTokenSequence)) {
posStartSequence = selectedLayoutTokenSequence.get(0).getOffset();
String localText = LayoutTokensUtil.toText(selectedLayoutTokenSequence);
if (posStartSequence <= globalContextOffset && globalContextOffset < posStartSequence + localText.length()) {
Expand All @@ -2659,7 +2688,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
}

context = entity.getContext();
if (context == null || context.trim().length() == 0) {
if (StringUtils.isNotBlank(context)) {
// this should never happen, but just in case...
entity.setContext(oldContext);
context = oldContext;
Expand Down Expand Up @@ -2851,7 +2880,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
Collections.sort(entities);

// finally classify the context for predicting the role of the software mention
entities = SoftwareContextClassifier.getInstance(softwareConfiguration).classifyDocumentContexts(entities);
// entities = SoftwareContextClassifier.getInstance(softwareConfiguration).classifyDocumentContexts(entities);

return Pair.of(entities, resCitations);
}
Expand Down
Loading