Validation Package pride

validation; pride; proteomics

v1.0.3 released on 11/15/2024

by Oliver Maus; Christopher Lux

164 Downloads

Validates if the ARC contains the necessary metadata to be publishable via PRIDE.


Install with arc-validate

 arc-validate package install pride --version 1.0.3

Include in a PLANTDataHUB CQC pipeline

validation_packages:
  - name: pride
    version: 1.0.3

Description

Validates if the ARC contains the necessary metadata to be publishable via PRIDE.

The following metadata is required:

- Investigation has title and description

- Investigation has Keywords comment in correct format

- All persons in Investigation Contacts must have a first name, last name, affiliation and valid email

- Study has protocol, tissue & species in correct format

- Assay has technology type, instrument model, and fixed and/or variable modification in correct format


Release notes

- process graph tokens correctly resolved


Browse code (v1.0.3)
let [<Literal>]PACKAGE_METADATA = """(*
---
Name: pride
Summary: Validates if the ARC contains the necessary metadata to be publishable via PRIDE.
Description: |
    Validates if the ARC contains the necessary metadata to be publishable via PRIDE.
    The following metadata is required:
        - Investigation has title and description
        - Investigation has Keywords comment in correct format
        - All persons in Investigation Contacts must have a first name, last name, affiliation and valid email
        - Study has protocol, tissue & species in correct format
        - Assay has technology type, instrument model, and fixed and/or variable modification in correct format
MajorVersion: 1
MinorVersion: 0
PatchVersion: 3
Publish: true
Authors:
  - FullName: Oliver Maus
    Email: maus@nfdi4plants.org
    Affiliation: DataPLANT
  - FullName: Christopher Lux
    Email: lux@csbiology.de
    Affiliation: RPTU Kaiserslautern
    AffiliationLink: http://rptu.de/startseite
Tags:
  - Name: validation
  - Name: pride
  - Name: proteomics
ReleaseNotes: |
  - process graph tokens correctly resolved
---
*)"""


#r "nuget: ARCExpect, 4.0.1"


open ControlledVocabulary
open Expecto
open ARCExpect
open ARCTokenization
open ARCTokenization.StructuralOntology
open System.IO


// Input:
let arcDir = Directory.GetCurrentDirectory()


// Values:
let absoluteDirectoryPaths = FileSystem.parseARCFileSystem arcDir

let investigationMetadata = 
    absoluteDirectoryPaths
    |> Investigation.parseMetadataSheetsFromTokens() arcDir 
    |> List.concat 

let studyMetadata =
    absoluteDirectoryPaths
    |> Study.parseMetadataSheetsFromTokens() arcDir
    |> List.concat

let assayMetadata =
    absoluteDirectoryPaths
    |> Assay.parseMetadataSheetsFromTokens() arcDir
    |> List.concat

let studyProcessGraphTokens = 
    try 
        absoluteDirectoryPaths
        |> Study.parseProcessGraphColumnsFromTokens arcDir
        |> Seq.collect Map.values
        |> List.concat
    with
        | _ -> List.empty

let assayProcessGraphTokens =
    try 
        absoluteDirectoryPaths
        |> Assay.parseProcessGraphColumnsFromTokens arcDir
        |> Seq.collect Map.values
        |> List.concat
    with
        | _ -> List.empty

let organismTokens =
    studyProcessGraphTokens
    |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("OBI:0100026","organism","OBI")))
    |> Option.defaultValue []

let tissueTokens =
    studyProcessGraphTokens
    |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("NCIT:C12801","Tissue","NCIT")))
    |> Option.defaultValue []

let instrumentTokens =
    assayProcessGraphTokens
    |> List.tryFind (fun cvpList -> cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1000031","instrument model","MS")))
    |> Option.defaultValue []

let modTokens =
    assayProcessGraphTokens
    |> List.filter (
        fun cvpList -> 
            cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1003021","Fixed modification","MS")) ||
            cvpList.Head |> Param.getValueAsTerm = (CvTerm.create("MS:1003022","Variable modification","MS"))
    )
    |> List.concat

let techTypeName = CvTerm.create("ASSMSO:00000011", "Assay Technology Type", "ASSMSO")
let techTypeTAN = CvTerm.create("ASSMSO:00000013", "Assay Technology Type Term Accession Number", "ASSMSO")
let techTypeTSR = CvTerm.create("ASSMSO:00000015", "Assay Technology Type Term Source REF", "ASSMSO")


// Helper functions (to deposit in ARCExpect later):

open System.Text


let characterLimit (lowerLimit : int option) (upperLimit : int option) =
    match lowerLimit, upperLimit with
    | None, None -> System.Text.RegularExpressions.Regex(@"^.{0,}$")
    | Some ll, None -> System.Text.RegularExpressions.Regex($"^.{{{ll},}}$")
    | None, Some ul -> System.Text.RegularExpressions.Regex($"^.{{0,{ul}}}$")
    | Some ll, Some ul -> System.Text.RegularExpressions.Regex($"^.{{{ll},{ul}}}$")


type ErrorMessage with

    static member ofIParamCollection error iParamCollection =

        let iParam = Seq.head iParamCollection

        let str = new StringBuilder()    
        str.AppendFormat("['{0}', ..] {1}\n",  Param.getCvName iParam, error) |> ignore 

        match Param.tryGetValueOfCvParamAttr "FilePath" iParam with
        | Some path ->
            str.AppendFormat(" > filePath '{0}'\n", path) |> ignore         
        | None -> ()

        match Param.tryGetValueOfCvParamAttr "Worksheet" iParam with
        | Some sheet ->
            str.AppendFormat(" > sheet '{0}'", sheet) |> ignore         
        | None -> ()

        match Param.tryGetValueOfCvParamAttr "Row" iParam with
        | Some row -> 
            str.AppendFormat(" > row '{0}'", row) |> ignore
        | None -> ()

        match Param.tryGetValueOfCvParamAttr "Column" iParam with
        | Some column -> 
            str.AppendFormat(" > column '{0}'", column) |> ignore
        | None -> ()        
                
        match Param.tryGetValueOfCvParamAttr "Line" iParam with
        | Some line ->
            str.AppendFormat(" > line '{0}'", line) |> ignore
        | None -> ()

        match Param.tryGetValueOfCvParamAttr "Position" iParam with
        | Some position -> 
            str.AppendFormat(" > position '{0}'", position) |> ignore
        | None -> ()
        str.ToString()


type Validate.ParamCollection with

    static member AllTermsSatisfyPredicate (projection : #IParam -> bool) (paramCollection : #seq<#IParam>) =
        match Seq.forall projection paramCollection with
        | true  -> ()
        | false ->
            ErrorMessage.ofIParamCollection $"does not satisfy the requirements" paramCollection
            |> Expecto.Tests.failtestNoStackf "%s"


// Validation Cases:
let investigationCases = 
    testList INVMSO.``Investigation Metadata``.INVESTIGATION.key.Name [
        // Investigation has title
        ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Title``.Name) {
            investigationMetadata
            |> Validate.ParamCollection.ContainsNonKeyParamWithTerm
                INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Title``
        }

        // Investigation has description
        ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Description``.Name) {
            investigationMetadata
            |> Validate.ParamCollection.ContainsNonKeyParamWithTerm
                INVMSO.``Investigation Metadata``.INVESTIGATION.``Investigation Description``
        }

        // Investigation has contacts with name, last name, affiliation and email
        // Investigation Person First Name
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``.Name} exists") {
            investigationMetadata
            |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``
        }
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``.Name} is not empty") {
            investigationMetadata
            |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person First Name``)
            |> Seq.iter Validate.Param.ValueIsNotEmpty
        }

        // Investigation Person Last Name
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``.Name} exists") {
            investigationMetadata
            |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``
        }
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``.Name} is not empty") {
            investigationMetadata
            |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Last Name``)
            |> Seq.iter Validate.Param.ValueIsNotEmpty
        }

        // Investigation Person Affiliation
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``.Name} exists") {
            investigationMetadata
            |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``
        }
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``.Name} is not empty") {
            investigationMetadata
            |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Affiliation``)
            |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key")
            |> Seq.iter Validate.Param.ValueIsNotEmpty
        }

        // Investigation Person Email
        ARCExpect.validationCase (TestID.Name $"{INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email``.Name} exists") {
        investigationMetadata
        |> Validate.ParamCollection.ContainsNonKeyParamWithTerm INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email``
        }
        ARCExpect.validationCase (TestID.Name INVMSO.``Investigation Metadata``. ``INVESTIGATION CONTACTS``.``Investigation Person Email``.Name) {
            investigationMetadata
            |> Seq.filter (Param.getTerm >> (=) INVMSO.``Investigation Metadata``.``INVESTIGATION CONTACTS``.``Investigation Person Email``)
            |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key")
            |> Seq.iter (Validate.Param.ValueMatchesRegex StringValidationPattern.email)
        }
    ]

let studyCases =
    testList STDMSO.``Study Metadata``.STUDY.key.Name [
        // Study has protocol in correct format
        ARCExpect.validationCase (TestID.Name STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.key.Name) {
            studyMetadata
            |> Validate.ParamCollection.ContainsParamWithTerm STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.key
        }

        // Study protocol is in correct format
        ARCExpect.validationCase (TestID.Name "STUDY PROTOCOLS description") {
            studyMetadata|> Seq.filter (fun iparam -> Param.getTerm iparam = STDMSO.``Study Metadata``.``STUDY PROTOCOLS``.``Study Protocol Description``)
            |> Seq.filter (Param.getValueAsString >> (<>) "Metadata Section Key")
            |> Seq.iter (Validate.Param.ValueMatchesRegex (characterLimit (Some 50) (Some 500)))
        }

        // Study has tissue header in process graph
        ARCExpect.validationCase (TestID.Name "Tissue") {
            tissueTokens
            |> Validate.ParamCollection.SatisfiesPredicate (
                fun iparams ->
                    iparams
                    |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("NCIT:C12801","Tissue","NCIT")))
            )
        }

        // Study has tissue values as terms (CvValues)
        ARCExpect.validationCase (TestID.Name "Tissue terms") {
            tissueTokens
            |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false)
        }

        // Study has species in correct format
        ARCExpect.validationCase (TestID.Name "Organism") {
            organismTokens
            |> Validate.ParamCollection.SatisfiesPredicate (
                fun iparams ->
                    iparams
                    |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("OBI:0100026","organism","OBI")))
            )
        }

        // Study has species values as terms (CvValues)
        ARCExpect.validationCase (TestID.Name "Organism terms") {
            tissueTokens
            |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false)
        }
    ]

let assayCases =
    testList ASSMSO.``Assay Metadata``.ASSAY.key.Name [

        // Assay has technology type in correct format
        ARCExpect.validationCase (TestID.Name techTypeTAN.Name) {
            assayMetadata
            |> Validate.ParamCollection.ContainsParamWithTerm techTypeTAN
        }
        ARCExpect.validationCase (TestID.Name techTypeName.Name) {
            assayMetadata
            |> Validate.ParamCollection.ContainsParamWithTerm techTypeName
        }
        ARCExpect.validationCase (TestID.Name techTypeTSR.Name) {
            assayMetadata
            |> Validate.ParamCollection.ContainsParamWithTerm techTypeTSR
        }

        // Assay has instrument model
        ARCExpect.validationCase (TestID.Name "instrument model") {
            instrumentTokens
            |> Validate.ParamCollection.SatisfiesPredicate (
                fun iparams ->
                    iparams
                    |> List.exists (fun iparam -> iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1000031","instrument model","MS")))
            )
        }

        // Assay has instrument model in correct format
        ARCExpect.validationCase (TestID.Name "instrument model terms") {
            instrumentTokens
            |> Validate.ParamCollection.AllTermsSatisfyPredicate (fun ip -> match ip.Value with CvValue _ -> true | _ -> false)
        }

        // Assay has fixed or variable modification in correct format
        ARCExpect.validationCase (TestID.Name "modification") {
            modTokens
            |> Validate.ParamCollection.SatisfiesPredicate (
                fun iparams ->
                    iparams
                    |> List.exists (
                        fun iparam -> 
                            iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1003021","Fixed modification","MS")) ||
                            iparam.Value = ParamValue.CvValue (CvTerm.create("MS:1003022","Variable modification","MS"))
                    )
            )
        }
    ]


// Execution:

Setup.ValidationPackage(
    metadata = Setup.Metadata(PACKAGE_METADATA),
    CriticalValidationCases = [investigationCases; studyCases; assayCases]
)
|> Execute.ValidationPipeline(
    basePath = arcDir
)

Available versions
Version Released on
1.0.3 11/15/2024
1.0.2 11/15/2024