| Title: | Extract Information from Clinical Reports from 'Oncomine Reporter' and NCBI 'ClinVar' |
|---|---|
| Description: | Clinical reports generated by 'Oncomine Reporter' software contain critical data in unstructured PDF format, making manual extraction time-consuming and error-prone. 'ORscraper' provides a coherent suite of functions to automate this process, allowing researchers to parse reports, identify key biomarkers, extract genetic variant tables, and filter results. It also integrates with the NCBI 'ClinVar' API <https://www.ncbi.nlm.nih.gov/clinvar/> to enrich extracted data. |
| Authors: | Samuel González [aut, cre] (ORCID: <https://orcid.org/0009-0007-9531-9821>), Antonio Jesus Canepa [ctb] (ORCID: <https://orcid.org/0000-0002-0608-2743>), Patricia Saiz [ctb] (ORCID: <https://orcid.org/0000-0001-7106-5192>), María González [ctb] (ORCID: <https://orcid.org/0009-0000-1887-4644>) |
| Maintainer: | Samuel González <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 0.1.1 |
| Built: | 2026-05-09 08:14:50 UTC |
| Source: | https://github.com/samuelgonzalez0204/orscraper |
This function analyzes biopsy identifiers and categorizes them into specific types based on a defined rule.
classify_biopsy(biopsy_numbers)
biopsy_numbers |
Character vector. Identifiers of biopsies to classify. |
A character vector representing the sample type: 1 (biopsy), 2 (aspiration), 3 (cytology).
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
NB_values <- c()
NB_values <- extract_intermediate_values(NB_values, lines, "biopsia:")
biopsies_identifiers <- classify_biopsy(NB_values)
This function retrieves chip values from file names matching a specific pattern.
extract_chip_id(files)
files |
Character vector. File names to process. |
A character vector of chip identifiers extracted from the file names.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
chips <- extract_chip_id(files)
This function identifies and extracts fusion variants from text lines based on specific patterns.
extract_fusions(lines, mutations)
lines |
Character vector. Lines of text to search for fusion variants. |
mutations |
Character vector. List of mutations to look for. |
A list of fusion variants identified in the text.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
genes_file <- system.file("extdata/Genes.xlsx", package = "ORscraper")
genes <- readxl::read_excel(genes_file)
mutations <- unique(genes$GEN)
fusions <- extract_fusions(lines, mutations)
This function retrieves unique matches for a search pattern within text lines.
extract_intermediate_values(list_input, lines, search_text)
list_input |
List. The list to append extracted values to. |
lines |
Character vector. The text lines to search within. |
search_text |
Character. The pattern to search for. |
An updated list with appended values.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
NHC_Data <- NB_values <- dates <- textDiag <- c()
NHC_Data <- extract_intermediate_values(NHC_Data, lines, "NHC:")
NB_values <- extract_intermediate_values(NB_values, lines, "biopsia:")
dates <- extract_intermediate_values(dates, lines, "Fecha:")
textDiag <- extract_intermediate_values(textDiag, lines, "de la muestra:")
This function analyzes a subset of text lines, extracting information such as mutations, pathogenicity, frequencies, codifications and changes.
extract_values_from_tables(
  lines,
  mutations,
  genes_mutated = list(),
  pathogenicity = list(),
  frequencies = list(),
  codifications = list(),
  changes = list(),
  values = list(),
  start = "Variantes de secuencia de ADN",
  start2 = " Variaciones del número de copias",
  end = "Genes analizados",
  end2 = "Comentarios adicionales sobre las variantes"
)
lines |
Character vector. Lines of text to process. |
mutations |
Character vector. List of known mutation identifiers. |
genes_mutated |
Ordered list to store extracted gene data. |
pathogenicity |
Ordered list to store extracted pathogenicity information. |
frequencies |
Ordered list to store extracted frequency data. |
codifications |
Ordered list to store extracted codification data. |
changes |
Ordered list to store extracted changes data. |
values |
Aggregated list of extracted information. |
start |
Starting marker for the relevant table section. |
start2 |
Secondary starting marker for the table section, in case the table is split across two pages. |
end |
Text marker indicating the end of the subset. |
end2 |
Secondary end marker. |
A list containing extracted data: genes, pathogenicity, frequencies, codifications and changes.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
genes_file <- system.file("extdata/Genes.xlsx", package = "ORscraper")
genes <- readxl::read_excel(genes_file)
mutations <- unique(genes$GEN)
TableValues <- extract_values_from_tables(lines, mutations)
mutateGenes <- TableValues[[1]]
pathogenity <- TableValues[[2]]
frequencies <- TableValues[[3]]
codifications <- TableValues[[4]]
changes <- TableValues[[5]]
This function appends extracted variable values based on start or end markers to a list.
extract_values_start_end(list_input, lines, pattern)
list_input |
List. The list to append extracted values to. |
lines |
Character vector. The text lines to search within. |
pattern |
Character. The pattern to search for. |
An updated list with appended values.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
diagnostic <- gender <- tumor_cell_percentage <- quality <- c()
diagnostic <- extract_values_start_end(diagnostic, lines, ".*Diagnóstico:\\s")
gender <- extract_values_start_end(gender, lines, ".*Sexo:\\s*")
tumor_cell_percentage <- extract_values_start_end(
  tumor_cell_percentage, lines, ".*% células tumorales:\\s")
quality <- extract_values_start_end(
  quality, lines, ".*CALIDAD DE LA MUESTRA /LIMITACIONES PARA SU ANÁLISIS:\\s")
This function searches for a specific pattern in text lines and extracts the corresponding value.
extract_variable(lines, search_text)
lines |
Character vector. The lines of text to search within. |
search_text |
Character. The regular expression pattern to match. |
The extracted value as a character, or "Null" if not found.
This function filters a list of pathogenicity classifications, retaining only those marked as "Pathogenic".
filter_pathogenic_only(pathogenic_list, related_list)
pathogenic_list |
List. A list of pathogenicity classifications. |
related_list |
List. A list of corresponding data to filter alongside pathogenicity. |
A list containing only the elements of the related list corresponding to "Pathogenic" classifications.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
genes_file <- system.file("extdata/Genes.xlsx", package = "ORscraper")
genes <- readxl::read_excel(genes_file)
mutations <- unique(genes$GEN)
TableValues <- extract_values_from_tables(lines, mutations)
mutateGenes <- TableValues[[1]]
pathogenity <- TableValues[[2]]
frequencies <- TableValues[[3]]
changes <- TableValues[[5]]
pathogenic_mutations <- filter_pathogenic_only(pathogenity, mutateGenes)
pathogenic_changes <- filter_pathogenic_only(pathogenity, changes)
pathogenic_frequencies <- filter_pathogenic_only(pathogenity, frequencies)
This function extracts lines from a text based on specified start and end markers.
narrow_text(
  start_text,
  start_text2 = " Variaciones del número de copias",
  lines_total,
  text_limit,
  text_limit2 = "Comentarios adicionales sobre las variantes"
)
start_text |
Character. The text marker indicating the beginning of the subset. |
start_text2 |
Character. An optional secondary start marker. |
lines_total |
Character vector. The full set of text lines. |
text_limit |
Character vector. The text marker indicating the end of the subset. |
text_limit2 |
Character vector. An optional secondary end marker. |
A character vector containing the extracted lines.
This function extracts the text content from a PDF file and splits it into individual lines.
read_pdf_content(file_path)
file_path |
Character. The path to the PDF file. |
A character vector, where each element is a line from the PDF content.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1])
head(lines)
This function scans a specified directory and retrieves all files with a .pdf extension.
read_pdf_files(path)
path |
Character. Path to the directory to scan for PDF files. |
A character vector with the full paths of the PDF files.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
This function queries the NCBI ClinVar database for germline classifications based on gene and codification data.
search_ncbi_clinvar(pathogenicity, genes_mutated, total_codifications)
pathogenicity |
Ordered list. Existing pathogenicity data. |
genes_mutated |
Ordered list. Existing mutated gene data. |
total_codifications |
Ordered list. Existing mutated gene codification data. |
An updated list of pathogenicity classifications based on NCBI ClinVar search results.
InputPath <- system.file("extdata", package = "ORscraper")
files <- read_pdf_files(InputPath)
lines <- read_pdf_content(files[1]) # Example with the first file
genes_file <- system.file("extdata/Genes.xlsx", package = "ORscraper")
if (requireNamespace("readxl", quietly = TRUE)) {
  genes <- readxl::read_excel(genes_file)
  mutations <- unique(genes$GEN)
  TableValues <- extract_values_from_tables(lines, mutations)
  mutateGenes <- TableValues[[1]]
  pathogenity <- TableValues[[2]]
  codifications <- TableValues[[4]]
  search_pathogenity <- search_ncbi_clinvar(pathogenity, mutateGenes, codifications)
}
This function searches for a specific text pattern in a set of lines and extracts values that follow the pattern.
search_value(search_text, lines)
search_text |
Character. The pattern to search for in the text lines. |
lines |
Character vector. The lines of text to search within. |
A character vector with extracted values matching the search criteria.