Validation Package hhu-cmml

ARC; CMML

v0.0.1 released on 05/23/2025

by Dominik Brilhaus

65 Downloads

Validates if the ARC contains the necessary metadata for plant sample submission to HHU CMML.


Install with arc-validate

 arc-validate package install hhu-cmml --version 0.0.1

Include in a PLANTDataHUB CQC pipeline

validation_packages:
  - name: hhu-cmml
    version: 0.0.1

Description

Validates if the ARC contains the necessary metadata for plant sample submission to HHU CMML.

This package validates against the "Swate template study sheet for plant samples":

https://str.nfdi4plants.org/template/226689ec-4be3-4143-b775-f8856c8ed6a5

The following metadata is required (may not be empty):

- Input [Source Name]

- Parameter [sample submission date]

- Characteristic [organism] OBI:0100026

The following metadata is recommended:

- Characteristic [biological replicate] DPBO:0000042

- Characteristic [organism part] EFO:0000635

- Characteristic [plant age] DPBO:0000033

- Characteristic [genotype] EFO:0000513

- Parameter [normalisation factor]

- Parameter [dry weight]

- Parameter [fresh weight]

- Characteristic [resuspension volume]

- Characteristic [resuspension solution]


Release notes

- First version of CMML plant sample sheet validation package


Browse code (v0.0.1)
let [<Literal>]PACKAGE_METADATA = """(*
---
Name: hhu-cmml
Summary: Validates if the ARC contains the necessary metadata for plant sample submission to HHU CMML.
Description: |
  Validates if the ARC contains the necessary metadata for plant sample submission to HHU CMML.
  This is validates against "Swate template study sheet for plant samples" 
  https://str.nfdi4plants.org/template/226689ec-4be3-4143-b775-f8856c8ed6a5
  The following metadata is required (may not be empty):
    - Input [Source Name]
    - Parameter [sample submission date]
    - Characteristic [organism] OBI:0100026
  The following metadata is recommended:
    - Characteristic [biological replicate] DPBO:0000042
    - Characteristic [organism part] EFO:0000635
    - Characteristic [plant age] DPBO:0000033
    - Characteristic [genotype] EFO:0000513
    - Parameter [normalisation factor]
    - Parameter [dry weight]
    - Parameter [fresh weight]
    - Characteristic [resuspension volume]
    - Characteristic [resuspension solution]
MajorVersion: 0
MinorVersion: 0
PatchVersion: 1
Publish: true
Authors:
  - FullName: Dominik Brilhaus
    Affiliation: CEPLAS
    AffiliationLink: https://ceplas.eu
Tags:
  - Name: ARC
  - Name: CMML
ReleaseNotes: |
  - First version of CMML plant sample sheet validation package
---
*)"""
// PACKAGE_METADATA holds the package's YAML front matter (name, summary,
// description, version parts, authors, tags, release notes) as a literal string.
// It is handed to Setup.Metadata in the execution step at the end of this script.
// NOTE(review): this comment is deliberately placed AFTER the binding; the
// packaging tooling presumably expects the binding at the very start of the
// file - confirm before moving anything above it.
// NOTE(review): the description lists "Input [Source Name]" as required, but the
// matching validation case in `cases` is commented out, so it is not enforced.

#r "nuget: ARCExpect, 4.0.0"

open ControlledVocabulary
open Expecto
open ARCExpect
open ARCTokenization
open ARCTokenization.StructuralOntology
open System.IO
open System.Text
open FSharpAux

// Input:
// The ARC to validate is the directory the script is started from.
let arcDir = Directory.GetCurrentDirectory ()

// TEST
// let arcDir = @"/path/to/arc"

// Values:
// Result of running the ARC file-system parser on arcDir; consumed below when
// the study files are parsed.
let absoluteDirectoryPaths = arcDir |> FileSystem.parseARCFileSystem

// Process-graph columns of the ARC's study files, one map per study file.
// The maps are enumerated as sheet key -> token columns further down.
//
// Any failure while parsing falls back to a single empty map, so the
// validation cases below report "No plant growth table found" instead of the
// script crashing.
let studyFiles = 
    try 
        absoluteDirectoryPaths
        |> Study.parseProcessGraphColumnsFromTokens arcDir
        // Force evaluation inside the try block. If the parser hands back a
        // lazy sequence (NOTE(review): presumably it does - confirm), errors
        // would otherwise only surface on first enumeration, outside this
        // handler, and the fallback below would never apply.
        |> Seq.toList
        |> Seq.ofList
    with
        | _ -> seq{Map.empty}

/// Check whether a building block with ontology column header contains values.
///
/// cvt          - the term the column header must be equal to.
/// tokenColumns - the columns of one table; the first token of each column is
///                treated as its header, the remaining tokens as its values.
///
/// Fails the current test (Expecto failtestNoStackf) when
///   * no column has a header term equal to `cvt`,
///   * the matching column consists of the header only, or
///   * a value of the matching column has an empty term name
///     (the reported index is zero-based and does not count the header).
let containsFilledOutColumnCVT (cvt : CvTerm) (tokenColumns : IParam list list) =
    let column = 
        tokenColumns 
        |> List.tryFind (fun column ->
            // Total match instead of `column.Head`: an empty column can never
            // be the requested one and must not raise while searching.
            match column with
            | header :: _ -> Param.getValueAsTerm header = cvt
            | [] -> false
        )
    match column with
    | Some [ _ ] -> Expecto.Tests.failtestNoStackf $"{cvt.Name} column only contains header"            
    | Some (_ :: vals) -> 
        vals
        |> List.iteri (fun i token ->
            if (Param.getValueAsTerm token).Name = "" then
                Expecto.Tests.failtestNoStackf $"column {cvt.Name} contains empty value at index {i}"                  
        )
    | _ -> Expecto.Tests.failtestNoStackf $"table contains no {cvt.Name} header"

/// Check whether a building block without ontology column header (i.e. only name) contains values.
///
/// name         - the term name the column header must carry.
/// tokenColumns - the columns of one table; the first token of each column is
///                treated as its header, the remaining tokens as its values.
///
/// Fails the current test (Expecto failtestNoStackf) when
///   * no column has a header whose term name equals `name`,
///   * the matching column consists of the header only, or
///   * a value of the matching column has an empty term name
///     (the reported index is zero-based and does not count the header).
let containsFilledOutColumnName (name : string) (tokenColumns : IParam list list) =
    let column = 
        tokenColumns 
        |> List.tryFind (fun column ->
            // Total match instead of `column.Head`: an empty column can never
            // be the requested one and must not raise while searching.
            match column with
            | header :: _ -> (Param.getValueAsTerm header).Name = name
            | [] -> false
        )
    match column with
    | Some [ _ ] -> Expecto.Tests.failtestNoStackf $"{name} column only contains header"            
    | Some (_ :: vals) -> 
        vals
        |> List.iteri (fun i token ->
            if (Param.getValueAsTerm token).Name = "" then
                Expecto.Tests.failtestNoStackf $"column {name} contains empty value at index {i}"                  
        )
    | _ -> Expecto.Tests.failtestNoStackf $"table contains no {name} header" 



// All sheets (key/value entries of the study-file maps) that are plant growth
// tables: a sheet qualifies when any token in any of its columns is named
// "ProtocolType" and has the term name "plant growth protocol".
let plantGrowthSheets = 
    studyFiles
    |> Seq.collect (fun s ->
        s
        |> Seq.filter (fun kv ->
            kv.Value 
            |> Seq.concat
            |> Seq.exists (fun token -> 
                token.Name = "ProtocolType" 
                && 
                (Param.getValueAsTerm token).Name = "plant growth protocol"
            )
        )   
    )
    // This sequence is enumerated several times below (every Seq.isEmpty check
    // and every `for` loop). Cache it so the filtering - and any lazy work
    // upstream of it - runs once instead of once per enumeration.
    |> Seq.cache


// Validation Cases:
// Critical cases: at least one plant growth table must exist, and every such
// table needs a filled-out organism column and sample submission date column.
let cases = 
    testList "cases" [  // naming is difficult here

        ARCExpect.validationCase (TestID.Name "plant growth table") {
            if Seq.isEmpty plantGrowthSheets then
                Expecto.Tests.failtestNoStackf "No plant growth table found"
        }

        if not (Seq.isEmpty plantGrowthSheets) then
            for sheet in plantGrowthSheets do
                // NOTE(review): the Source Name check is disabled, although the
                // package description lists it as required.
                // ARCExpect.validationCase (TestID.Name $"{sheet.Key}: Source Name") {
                //     sheet.Value
                //     |> containsFilledOutColumnName "Source Name"
                // }
                ARCExpect.validationCase (TestID.Name $"{sheet.Key}: organism") {
                    containsFilledOutColumnCVT (CvTerm.create("OBI:0100026","organism","OBI")) sheet.Value
                }
                ARCExpect.validationCase (TestID.Name $"{sheet.Key}: sample submission date") {
                    containsFilledOutColumnName "sample submission date" sheet.Value
                }
    ]


// Non-critical (recommended) cases: for every plant growth table, one case per
// recommended column. Ontology-backed columns are matched by their full term,
// the remaining ones by header name only.
let nonCriticalCases = 
    // Recommended columns whose header is matched against a full ontology term.
    let ontologyColumns =
        [
            CvTerm.create("DPBO:0000042","biological replicate","DPBO")
            CvTerm.create("EFO:0000635","organism part","EFO")
            CvTerm.create("DPBO:0000033","plant age","DPBO")
            CvTerm.create("EFO:0000513","genotype","EFO")
        ]
    // Recommended columns whose header is matched by name only.
    let namedColumns =
        [
            "normalisation factor"
            "dry weight"
            "fresh weight"
            "resuspension volume"
            "resuspension solution"
        ]
    testList "cases" [  // naming is difficult here

        if not (Seq.isEmpty plantGrowthSheets) then
            for sheet in plantGrowthSheets do

                for term in ontologyColumns do
                    ARCExpect.validationCase (TestID.Name $"{sheet.Key}: {term.Name}") {
                        containsFilledOutColumnCVT term sheet.Value
                    }

                for columnName in namedColumns do
                    ARCExpect.validationCase (TestID.Name $"{sheet.Key}: {columnName}") {
                        containsFilledOutColumnName columnName sheet.Value
                    }
    ]

// Execution:
// Bundle the metadata with the critical and non-critical test lists ...
let validationPackage =
    Setup.ValidationPackage(
        metadata = Setup.Metadata(PACKAGE_METADATA),
        CriticalValidationCases = [cases],
        NonCriticalValidationCases = [nonCriticalCases]
    )

// ... and run the validation pipeline with the ARC directory as base path.
validationPackage
|> Execute.ValidationPipeline(
    basePath = arcDir
)

Available versions
Version Released on
0.0.1 05/23/2025