Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 101 additions & 72 deletions src/main/java/org/grobid/core/engines/SoftwareParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import nu.xom.Attribute;
import nu.xom.Element;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -215,8 +216,7 @@ public List<SoftwareEntity> processText(String text, boolean disambiguate) throw
List<LayoutToken> tokens = SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(text);
if (CollectionUtils.isEmpty(tokens)) {
return null;
} else if (tokens.size() > 512 && softwareConfiguration.getModel("software").engine.equals("delft")) {
// } else if (tokens.size() > 512) {
} else if (needToSplitInSentences(tokens)) {
String tempText = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);
List<List<LayoutToken>> outputTokens = offsetPositions.stream()
Expand Down Expand Up @@ -346,6 +346,16 @@ public List<SoftwareEntity> processText(String text, boolean disambiguate) throw
return entities;
}

/**
 * Decides whether a token sequence must be segmented into sentences before
 * sequence labelling: only sequences longer than 512 tokens need splitting,
 * and only when a Deep Learning engine (limited input window, e.g. BERT)
 * is used for labelling.
 *
 * @param tokens the layout tokens of the candidate sequence
 * @return true when the sequence exceeds 512 tokens and a DL engine is in use
 */
private boolean needToSplitInSentences(List<LayoutToken> tokens) {
    // evaluate the cheap size check first; the engine lookup is only
    // performed when the sequence is actually over the limit
    final boolean exceedsModelWindow = tokens.size() > 512;
    return exceedsModelWindow && isSequenceLabellingUsingDL();
}

/**
 * Returns true when sequence labelling relies on a Deep Learning backend,
 * i.e. when either the "software" or the "software-type" model is configured
 * with the "delft" engine. Such engines have a bounded input window, so
 * callers use this to decide whether long sequences must be split.
 *
 * @return true if at least one of the two labelling models uses "delft"
 */
private boolean isSequenceLabellingUsingDL() {
    // null-safe comparison: avoids a NullPointerException if the engine
    // field is not set in the configuration (removed leftover debug
    // "return true;" comment as well)
    return "delft".equals(softwareConfiguration.getModel("software").engine)
        || "delft".equals(softwareConfiguration.getModel("software-type").engine);
}

/**
* Extract all Software mentions from a pdf file
*/
Expand Down Expand Up @@ -825,14 +835,14 @@ public Pair<List<SoftwareEntity>, Document> processPDF(File file,
*/
private List<SoftwareEntity> processLayoutTokenSequence(
List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
) {
List<LayoutTokenization> layoutTokenizations = new ArrayList<LayoutTokenization>();
List<LayoutTokenization> layoutTokenizations = new ArrayList<>();
layoutTokenizations.add(new LayoutTokenization(layoutTokens));
return processLayoutTokenSequences(layoutTokenizations, entities, disambiguate, addParagraphContext, fromPDF, fromXML, pdfAnnotations);
}
Expand All @@ -842,11 +852,12 @@ private List<SoftwareEntity> processLayoutTokenSequence(
*/
private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
List<List<LayoutToken>> layoutTokenList,
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML) {
List<SoftwareEntity> entities,
boolean disambiguate,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML
) {

return processLayoutTokenSequenceMultiple(
layoutTokenList,
Expand All @@ -868,7 +879,7 @@ private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
boolean fromXML,
List<PDFAnnotation> pdfAnnotations
) {
List<LayoutTokenization> layoutTokenizations = new ArrayList<LayoutTokenization>();
List<LayoutTokenization> layoutTokenizations = new ArrayList<>();
for (List<LayoutToken> layoutTokens : layoutTokenList)
layoutTokenizations.add(new LayoutTokenization(layoutTokens));
return processLayoutTokenSequences(layoutTokenizations, entities, disambiguate, addParagraphContext, fromPDF, fromXML, pdfAnnotations);
Expand All @@ -891,7 +902,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(
List<LayoutToken> layoutTokens = layoutTokenization.getTokenization();
layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens);

if ((layoutTokens == null) || (layoutTokens.size() == 0))
if (CollectionUtils.isEmpty(layoutTokens))
continue;

// positions for lexical match
Expand All @@ -910,7 +921,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(

// labeled result from sequence labelling lib
String allResString = allRess.toString();
if (allResString.trim().length() == 0) {
if (StringUtils.isBlank(allResString)) {
// empty content, nothing more to do
return entities;
}
Expand All @@ -922,7 +933,7 @@ private List<SoftwareEntity> processLayoutTokenSequences(
List<LayoutToken> layoutTokens = layoutTokenization.getTokenization();
layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens);

if ((layoutTokens == null) || (layoutTokens.size() == 0))
if (CollectionUtils.isEmpty(layoutTokens))
continue;

// text of the selected segment
Expand All @@ -933,8 +944,9 @@ private List<SoftwareEntity> processLayoutTokenSequences(

String localRes = resBlocks[l];

if (localRes == null || localRes.length() == 0)
if (StringUtils.isBlank(localRes)) {
continue;
}

List<SoftwareComponent> components = extractSoftwareComponents(text, localRes, layoutTokens);
l++;
Expand Down Expand Up @@ -968,10 +980,10 @@ private List<SoftwareEntity> processLayoutTokenSequences(
//text = LayoutTokensUtil.normalizeDehyphenizeText(layoutTokens);

// finally refine software types, if there is anything to refine
if (localEntities.size() > 0) {
if (CollectionUtils.isNotEmpty(localEntities)) {
try {
List<SoftwareType> entityTypes = softwareTypeParser.processFeatureInput(text, localRes, layoutTokens);
if (entityTypes != null && entityTypes.size() > 0) {
if (CollectionUtils.isNotEmpty(entityTypes)) {
localEntities = refineTypes(localEntities, entityTypes);
Collections.sort(localEntities);
}
Expand Down Expand Up @@ -1066,15 +1078,17 @@ private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(
return entities;
}

public List<SoftwareEntity> propagateLayoutTokenSequence(List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
Map<String, Double> termProfiles,
FastMatcher termPattern,
List<OffsetPosition> placeTaken,
Map<String, Integer> frequencies,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML) {
public List<SoftwareEntity> propagateLayoutTokenSequence(
List<LayoutToken> layoutTokens,
List<SoftwareEntity> entities,
Map<String, Double> termProfiles,
FastMatcher termPattern,
List<OffsetPosition> placeTaken,
Map<String, Integer> frequencies,
boolean addParagraphContext,
boolean fromPDF,
boolean fromXML
) {
// possible offset of the sequence in the complete document tokenization
int offsetShift = 0;
if (layoutTokens != null && layoutTokens.size() > 0 && layoutTokens.get(0).getOffset() != 0) {
Expand Down Expand Up @@ -1813,23 +1827,25 @@ public String addFeatures(List<LayoutToken> tokens,
/**
* Add to the entities their local text contexts: the sentence where the software name occurs.
*/
public List<SoftwareEntity> addContext(List<SoftwareEntity> entities,
String text,
List<LayoutToken> tokens,
boolean fromPDF,
boolean fromXML,
boolean addParagraphContext) {
public List<SoftwareEntity> addContext(
List<SoftwareEntity> entities,
String text,
List<LayoutToken> tokens,
boolean fromPDF,
boolean fromXML,
boolean addParagraphContext
) {
// adjust offsets if tokenization does not start at 0
int offsetShift = 0;
if (tokens != null && tokens.size() > 0 && tokens.get(0).getOffset() != 0) {
if (CollectionUtils.isNotEmpty(tokens) && tokens.get(0).getOffset() != 0) {
offsetShift = tokens.get(0).getOffset();
}

//System.out.println("offsetShift: " + offsetShift);

// we start by segmenting the tokenized text into sentences

List<OffsetPosition> forbidden = new ArrayList<OffsetPosition>();
List<OffsetPosition> forbidden = new ArrayList<>();
// fill the position where entities occur to avoid segmenting in the middle of a software string, same for
// the reference marker positions too
for (SoftwareEntity entity : entities) {
Expand Down Expand Up @@ -2454,9 +2470,11 @@ public int boostrapTrainingPDF(String inputDirectory,
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(File file,
boolean disambiguate,
boolean addParagraphContext) throws IOException {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(
File file,
boolean disambiguate,
boolean addParagraphContext
) throws IOException {
Pair<List<SoftwareEntity>, List<BibDataSet>> resultExtraction = null;
try {
String tei = processXML(file);
Expand Down Expand Up @@ -2484,9 +2502,11 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processXML(File file,
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEI(File file,
boolean disambiguate,
boolean addParagraphContext) throws IOException {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEI(
File file,
boolean disambiguate,
boolean addParagraphContext
) throws IOException {
Pair<List<SoftwareEntity>, List<BibDataSet>> resultExtraction = null;
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Expand Down Expand Up @@ -2546,9 +2566,11 @@ public String processXML(File file) throws Exception {
/**
* Extract all software mentions from a publisher XML file
*/
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext) {
public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(
org.w3c.dom.Document doc,
boolean disambiguate,
boolean addParagraphContext
) {
List<SoftwareEntity> entities = new ArrayList<>();

List<List<LayoutToken>> selectedLayoutTokenSequencesRaw = new ArrayList<>();
Expand Down Expand Up @@ -2577,9 +2599,9 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
if (StringUtils.isNotBlank(contentText)) {
List<LayoutToken> paragraphTokens =
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(contentText);
String orginalText = UnicodeUtil.normaliseText(paragraphElement.getTextContent());
String originalText = UnicodeUtil.normaliseText(paragraphElement.getTextContent());
List<LayoutToken> originalParagraphTokens =
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(orginalText);
SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(originalText);
if (CollectionUtils.isNotEmpty(paragraphTokens)) {
// shift the paragraph tokens to the global position
for (LayoutToken paragraphToken : paragraphTokens) {
Expand All @@ -2600,28 +2622,35 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
}
}

logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequencesRaw.stream().filter(s -> s.size() > 512).count());
List<List<LayoutToken>> selectedLayoutTokenSequences = new ArrayList<>(selectedLayoutTokenSequencesRaw);

// Run the sentence segmenter for sequences that are larger than 512 tokens, what the BERT model can handle.
List<List<LayoutToken>> selectedLayoutTokenSequences = selectedLayoutTokenSequencesRaw.stream()
.flatMap(tokens -> {
if (tokens.size() > 512) {
String tempText = LayoutTokensUtil.toText(tokens);
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);
List<List<LayoutToken>> splitTokens =offsetPositions.stream()
.map(op -> tempText.substring(op.start, op.end))
.map(s -> SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(s))
.collect(Collectors.toList());
return splitTokens.stream();
}
return Stream.of(tokens);
}
).collect(Collectors.toList());
if (isSequenceLabellingUsingDL()) {
logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequencesRaw.stream().filter(s -> s.size() > 512).count());

// Run the sentence segmenter for sequences that are larger than 512 tokens, what the BERT model can handle.
List<List<LayoutToken>> tempSequences = new ArrayList<>();
for (List<LayoutToken> tokens : selectedLayoutTokenSequencesRaw) {
if (tokens.size() > 512) {
// Split large sequences into sentences
String tempText = LayoutTokensUtil.toText(tokens);
int offset = tokens.get(0).getOffset();
List<OffsetPosition> offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText);

logger.warn("Number of sequences too large for the BERT model: "
+ selectedLayoutTokenSequences.stream().filter(s -> s.size() > 512).count());
for (OffsetPosition op : offsetPositions) {
String sentenceText = tempText.substring(op.start, op.end);
List<LayoutToken> sentenceTokens = SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(sentenceText);
sentenceTokens.stream().forEach(token -> token.setOffset(token.getOffset() + offset + op.start));

tempSequences.add(sentenceTokens);
}
} else {
// Keep small sequences as-is
tempSequences.add(tokens);
}
}
selectedLayoutTokenSequences = tempSequences;
}

processLayoutTokenSequenceMultiple(selectedLayoutTokenSequences, entities, disambiguate, addParagraphContext, false, true);
selectedLayoutTokenSequences = selectedOriginalLayoutTokenSequences;
Expand All @@ -2633,7 +2662,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
String context = entity.getContext();
if (context == null)
continue;
String oldContext = new String(entity.getContext());
String oldContext = entity.getContext();

int paragraphContextOffset = entity.getParagraphContextOffset();
int globalContextOffset = entity.getGlobalContextOffset();
Expand All @@ -2645,7 +2674,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
for (int i = sequenceIndex; i < selectedLayoutTokenSequences.size(); i++) {
int posStartSequence = -1;
List<LayoutToken> selectedLayoutTokenSequence = selectedLayoutTokenSequences.get(i);
if (selectedLayoutTokenSequence != null && selectedLayoutTokenSequence.size() > 0) {
if (CollectionUtils.isNotEmpty(selectedLayoutTokenSequence)) {
posStartSequence = selectedLayoutTokenSequence.get(0).getOffset();
String localText = LayoutTokensUtil.toText(selectedLayoutTokenSequence);
if (posStartSequence <= globalContextOffset && globalContextOffset < posStartSequence + localText.length()) {
Expand All @@ -2659,7 +2688,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
}

context = entity.getContext();
if (context == null || context.trim().length() == 0) {
if (StringUtils.isNotBlank(context)) {
// this should never happen, but just in case...
entity.setContext(oldContext);
context = oldContext;
Expand Down Expand Up @@ -2851,7 +2880,7 @@ public Pair<List<SoftwareEntity>, List<BibDataSet>> processTEIDocument(org.w3c.d
Collections.sort(entities);

// finally classify the context for predicting the role of the software mention
entities = SoftwareContextClassifier.getInstance(softwareConfiguration).classifyDocumentContexts(entities);
// entities = SoftwareContextClassifier.getInstance(softwareConfiguration).classifyDocumentContexts(entities);

return Pair.of(entities, resCitations);
}
Expand Down
Loading