248 changes: 248 additions & 0 deletions src/main/java/org/grobid/core/data/ArticleBiblio.java
@@ -0,0 +1,248 @@
package org.grobid.core.data;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.NodeList;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
* Data class holding article-level metadata (DOI, title, authors) extracted from a document.
* This class is kept separate from BiblioComponent, which is designed for reference
* components attached to software mentions.
*/
public class ArticleBiblio {
private static final Logger LOGGER = LoggerFactory.getLogger(ArticleBiblio.class);

private String doi;
private String title;
private String authors;

public ArticleBiblio() {
}

public ArticleBiblio(String doi, String title, String authors) {
this.doi = doi;
this.title = title;
this.authors = authors;
}

// Getters and setters
public String getDoi() {
return doi;
}

public void setDoi(String doi) {
this.doi = doi;
}

public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

public void setAuthors(String authors) {
this.authors = authors;
}

public String getAuthors() {
return this.authors;
}

/**
* Check whether this record carries any meaningful content (DOI, title or authors)
*/
public boolean hasContent() {
return (StringUtils.isNotBlank(doi)) ||
(StringUtils.isNotBlank(title)) ||
(StringUtils.isNotBlank(authors));
}

/**
* Serialize this ArticleBiblio as a JSON fragment ("biblio" object) for the API response
*/
public String toJson() {
ObjectMapper mapper = new ObjectMapper();
StringBuilder json = new StringBuilder();

json.append("\"biblio\": {");
boolean firstField = true;

// Add DOI if available
if (StringUtils.isNotBlank(doi)) {
if (!firstField) json.append(", ");
try {
json.append("\"doi\": ").append(mapper.writeValueAsString(doi));
} catch (JsonProcessingException e) {
json.append("\"doi\": \"\"");
}
firstField = false;
}

// Add title if available
if (StringUtils.isNotBlank(title)) {
if (!firstField) json.append(", ");
try {
json.append("\"title\": ").append(mapper.writeValueAsString(title));
} catch (JsonProcessingException e) {
json.append("\"title\": \"\"");
}
firstField = false;
}

// Add authors if available, serialized with Jackson so quotes and backslashes are escaped
if (StringUtils.isNotBlank(authors)) {
if (!firstField) json.append(", ");
try {
json.append("\"authors\": ").append(mapper.writeValueAsString(authors));
} catch (JsonProcessingException e) {
json.append("\"authors\": \"\"");
}
}

json.append("}");
return json.toString();
}
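
// Illustrative output when all three fields are populated (made-up values; actual
// escaping is delegated to Jackson):
//   "biblio": {"doi": "10.1234/example.5678", "title": "An Example Title", "authors": "Doe, Jane, Smith, John"}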

/**
* Create an ArticleBiblio from a BiblioItem (typically the parsed document header)
*/
public static Optional<ArticleBiblio> fromBiblioItem(BiblioItem biblioItem) {
if (biblioItem == null) {
LOGGER.debug("BiblioItem is null, cannot create ArticleBiblio");
return Optional.empty();
}

LOGGER.debug("Creating ArticleBiblio from BiblioItem");
ArticleBiblio metadata = new ArticleBiblio();

if (StringUtils.isNotBlank(biblioItem.getDOI())) {
metadata.setDoi(biblioItem.getDOI());
LOGGER.debug("Extracted DOI: {}", biblioItem.getDOI());
}

if (StringUtils.isNotBlank(biblioItem.getTitle())) {
metadata.setTitle(biblioItem.getTitle());
LOGGER.debug("Extracted title: {}", biblioItem.getTitle());
}

String authors = LayoutTokensUtil.normalizeText(biblioItem.getAuthors());
if (StringUtils.isNotBlank(authors)) {
metadata.setAuthors(authors);
}

return metadata.hasContent() ? Optional.of(metadata) : Optional.empty();
}

/**
* Extract article metadata from TEI XML Document using XPath
*/
public static Optional<ArticleBiblio> fromTeiDocument(org.w3c.dom.Document teiDocument) {
if (teiDocument == null) {
return Optional.empty();
}

try {
XPathFactory xPathFactory = XPathFactory.newInstance();
XPath xpath = xPathFactory.newXPath();

String title = extractTitle(teiDocument, xpath);
String doi = extractDOI(teiDocument, xpath);
List<String> authors = extractAuthors(teiDocument, xpath);

ArticleBiblio articleMetadata = new ArticleBiblio();
articleMetadata.setDoi(doi);
articleMetadata.setTitle(title);

if (CollectionUtils.isNotEmpty(authors)) {
articleMetadata.setAuthors(String.join(", ", authors));
}

return articleMetadata.hasContent() ? Optional.of(articleMetadata) : Optional.empty();
} catch (Exception e) {
LOGGER.debug("Failed to extract article metadata from the TEI document", e);
return Optional.empty();
}
}
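
// For reference, a trimmed and purely illustrative TEI header shape that the XPath
// expressions below assume (element content and idno ordering are invented):
//
//   <teiHeader>
//     <fileDesc>
//       <titleStmt>
//         <title level="a" type="main">An Example Title</title>
//       </titleStmt>
//       <sourceDesc>
//         <biblStruct>
//           <analytic>
//             <author><persName><forename>Jane</forename><surname>Doe</surname></persName></author>
//           </analytic>
//           <idno>first-identifier</idno>
//           <idno>10.1234/example.5678</idno>  <- extractDOI() assumes the DOI is the second idno
//         </biblStruct>
//       </sourceDesc>
//     </fileDesc>
//   </teiHeader>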

private static String extractTitle(org.w3c.dom.Document doc, XPath xpath) {
try {
NodeList titleNodes = (NodeList) xpath.evaluate("//teiHeader/fileDesc/titleStmt/title[@level='a'][@type='main']/text()", doc, XPathConstants.NODESET);
if (titleNodes != null && titleNodes.getLength() > 0) {
return titleNodes.item(0).getNodeValue().trim();
}
} catch (Exception e) {
LOGGER.debug("Failed to extract the title from the TEI header", e);
}
return "";
}

private static String extractDOI(org.w3c.dom.Document doc, XPath xpath) {
try {
// Note: this positional selector assumes the DOI is the second <idno> of the header biblStruct
NodeList doiNodes = (NodeList) xpath.evaluate("//teiHeader/fileDesc/sourceDesc/biblStruct/idno[2]/text()", doc, XPathConstants.NODESET);
if (doiNodes != null && doiNodes.getLength() > 0) {
return doiNodes.item(0).getNodeValue().trim();
}
} catch (Exception e) {
LOGGER.debug("Failed to extract the DOI from the TEI header", e);
}
return "";
}

private static List<String> extractAuthors(org.w3c.dom.Document doc, XPath xpath) {
List<String> authors = new ArrayList<>();
try {
NodeList authorNodes = (NodeList) xpath.evaluate("//teiHeader/fileDesc/sourceDesc/biblStruct/analytic/author/persName", doc, XPathConstants.NODESET);
for (int i = 0; i < authorNodes.getLength(); i++) {
String author = formatAuthorFromNode(authorNodes.item(i));
if (!author.isEmpty() && !authors.contains(author)) {
authors.add(author);
}
}
} catch (Exception e) {
LOGGER.debug("Failed to extract authors from the TEI header", e);
}
return authors;
}

/**
* Format an author from a TEI persName node as "surname, forename"
*/
private static String formatAuthorFromNode(org.w3c.dom.Node node) {
if (node == null) return "";

if (node.getNodeName().equals("persName")) {
String surname = "";
String forename = "";

NodeList childNodes = node.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
org.w3c.dom.Node child = childNodes.item(i);
if (child.getNodeName().equals("surname")) {
surname = child.getTextContent().trim();
} else if (child.getNodeName().equals("forename")) {
forename = child.getTextContent().trim();
}
}

if (!surname.isEmpty()) {
return forename.isEmpty() ? surname : surname + ", " + forename;
}
}

return node.getTextContent().trim();
}

@Override
public String toString() {
return String.format("ArticleBiblio{doi='%s', title='%s', authors='%s'}",
doi, title, authors);
}
}
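
A minimal usage sketch of the new class, not part of the diff: it assumes grobid-core on the classpath and a BiblioItem already parsed from the document header by the caller; only fromBiblioItem() and toJson() from the file above are exercised, and the helper name appendBiblio is invented for illustration.

import java.util.Optional;

import org.grobid.core.data.ArticleBiblio;
import org.grobid.core.data.BiblioItem;

public class ArticleBiblioUsageSketch {

    // Hypothetical helper: append the optional "biblio" fragment to a JSON response
    // buffer that already contains at least one preceding field.
    public static void appendBiblio(StringBuilder response, BiblioItem headerItem) {
        Optional<ArticleBiblio> biblio = ArticleBiblio.fromBiblioItem(headerItem);
        biblio.ifPresent(b -> response.append(", ").append(b.toJson()));
    }
}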
14 changes: 3 additions & 11 deletions src/main/java/org/grobid/core/data/BiblioComponent.java
@@ -1,17 +1,9 @@
package org.grobid.core.data;

import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.lexicon.SoftwareLexicon;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.util.List;

import org.grobid.core.layout.BoundingBox;
import org.grobid.core.utilities.TextUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -82,7 +74,7 @@ public String toJson() {
}
}*/
buffer.append(", \"refKey\": " + refKey);

// knowledge information
if (wikidataId != null) {
buffer.append(", \"wikidataId\": \"" + wikidataId + "\"");
@@ -1,36 +1,21 @@
package org.grobid.core.engines;

import java.util.*;

import org.apache.commons.io.FileUtils;
import org.grobid.core.GrobidModels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.utilities.*;
import org.grobid.core.jni.PythonEnvironmentConfig;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.SoftwareContextAttributes;
import org.grobid.core.data.SoftwareEntity;
import org.grobid.core.jni.DeLFTClassifierModel;
import org.grobid.core.utilities.GrobidConfig.ModelParameters;
import org.grobid.core.utilities.SoftwareConfiguration;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.data.SoftwareEntity;
import org.grobid.core.data.SoftwareContextAttributes;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.SystemUtils;
import org.apache.commons.lang3.tuple.Pair;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.databind.*;
import com.fasterxml.jackson.databind.node.*;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.core.io.*;

import static org.apache.commons.lang3.ArrayUtils.isEmpty;
import java.util.*;

/**
* Use a Deep Learning multiclass and multilabel classifier to characterize the context of a recognized software mention.
@@ -132,7 +117,7 @@ public String classify(String text, MODEL_TYPE type) throws Exception {
* @return list of predicted labels/scores pairs for each text
*/
public String classify(List<String> texts, MODEL_TYPE type) throws Exception {
if (texts == null || texts.size() == 0)
if (CollectionUtils.isEmpty(texts))
return null;

LOGGER.info("classify: " + texts.size() + " sentence(s) for type " + type.toString());
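
// Side note on the refactored guard above: CollectionUtils.isEmpty from
// org.apache.commons.collections4 is assumed to cover both cases that the old
// check tested explicitly:
//   CollectionUtils.isEmpty(null)                              -> true
//   CollectionUtils.isEmpty(Collections.emptyList())           -> true
//   CollectionUtils.isEmpty(Collections.singletonList("text")) -> false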