JSON Schemas
This appendix provides a reference for the JSON schema documents used by ibi Data Quality for data analysis results.
Profiling Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/profile",
  "title": "Profile",
  "description": "Profile of the dataset",
  "type": "object",
  "properties": {
    "inputDataSetName": {
      "type": "string",
      "description": "Name of the input data set"
    },
    "createdBy": {
      "type": "string",
      "description": "User account that executed profiling analysis and created the data profile"
    },
    "createdDate": {
      "type": "string",
      "description": "Date and time when the data profile is created"
    },
    "countRows": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "countVariables": {
      "type": "integer",
      "description": "Number of columns analyzed"
    },
    "countObservations": {
      "type": "integer",
      "description": "Number of observations (rows times columns) analyzed"
    },
    "countDuplicateRows": {
      "type": "integer",
      "description": "Number of duplicate rows in the data set"
    },
    "pctDuplicateRows": {
      "type": "number",
      "description": "Percentage of total rows that are duplicate rows in the data set"
    },
    "countDuplicateRowsRemoved": {
      "type": "integer",
      "description": "Number of duplicate rows removed in the deduplication process"
    },
    "pctDuplicateRowsRemoved": {
      "type": "number",
      "description": "Percentage of total rows that were duplicate and were removed in the deduplication process"
    },
    "countRowsAfterDeduplication": {
      "type": "integer",
      "description": "Number of total rows that remain in the data set after the deduplication process"
    },
    "countMissing": {
      "type": "integer",
      "description": "Number of observations that are missing (blank or null) in the data set"
    },
    "pctMissing": {
      "type": "number",
      "description": "Percentage of total observations that are missing (blank or null) in the data set"
    },
    "profileScore": {
      "type": "number",
      "description": "Data profile score for the entire data set"
    },
    "variables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string",
            "description": "Name of the variable"
          },
          "type": {
            "enum": [
              "string",
              "number",
              "date"
            ],
            "description": "Data type of the variable"
          },
          "count": {
            "type": "integer",
            "description": "Total number of observations"
          },
          "countDistinct": {
            "type": "integer",
            "description": "Total number of distinct observations, distinct includes null and blank values"
          },
          "countUnique": {
            "type": "integer",
            "description": "Total number of unique observations, unique does not include null or blank values"
          },
          "countNonUnique": {
            "type": "integer",
            "description": "Total number of non-unique observations"
          },
          "countNulls": {
            "type": "integer",
            "description": "Total number of null observations"
          },
          "countNotNulls": {
            "type": "integer",
            "description": "Total number of non-null observations"
          },
          "countBlanks": {
            "type": "integer",
            "description": "Total number of blank observations"
          },
          "pctUnique": {
            "type": "number",
            "description": "Percentage of total observations that are unique"
          },
          "pctComplete": {
            "type": "number",
            "description": "Percentage of total observations that are not null or blank"
          },
          "sensitive": {
            "type": "boolean",
            "description": "true if the variable contains sensitive data"
          },
          "frequency": {
            "type": "array",
            "description": "Frequency of values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Observation value"
                },
                "count": {
                  "type": "integer",
                  "description": "Number of observations that has this value"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observations that has this value"
                }
              }
            }
          },
          "lengths": {
            "type": "object",
            "description": "Character length of the observation values",
            "properties": {
              "min": {
                "type": "integer",
                "description": "Minimum length of all the observation values"
              },
              "max": {
                "type": "integer",
                "description": "Maximum length of all the observation values"
              },
              "median": {
                "type": "number",
                "description": "Median length of all the observation values"
              },
              "avg": {
                "type": "number",
                "description": "Average length of all the observation values"
              }
            }
          },
          "patterns": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Patterns discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Pattern value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this pattern"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this pattern"
                }
              }
            }
          },
          "masks": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Masks discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Mask value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this mask"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this mask"
                }
              }
            }
          },
          "contentTypes": {
            "type": "array",
            "description": "Data classes discovered from the observation values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Name of the data class"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that belong to this data class"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that belong to this data class"
                }
              }
            }
          },
          "numeric": {
            "type": "array",
            "description": "Numerical statistics of the observation values",
            "items": {
              "type": "object",
              "properties": {
                "variable": {
                  "type": "string",
                  "description": "Name of the variable"
                },
                "avg": {
                  "type": "number",
                  "description": "Average value"
                },
                "min": {
                  "type": "number",
                  "description": "Minimum value"
                },
                "max": {
                  "type": "number",
                  "description": "Maximum value"
                },
                "stdDev": {
                  "type": "number",
                  "description": "Standard deviation"
                },
                "pctl25": {
                  "type": "number",
                  "description": "25th percentile"
                },
                "pctl50": {
                  "type": "number",
                  "description": "50th percentile"
                },
                "pctl75": {
                  "type": "number",
                  "description": "75th percentile"
                }
              }
            }
          },
          "profileScore": {
            "type": "number",
            "description": "Profile score for the individual variable"
          }
        }
      }
    },
    "notNullsSvg": {
      "type": "string",
      "description": "XML representing the completeness chart of the data set"
    },
    "variableOptions": {
      "type": "array",
      "items": {
        "type": "object",
        "description": "Data expectations set by the user",
        "properties": {
          "id": {
            "type": "string",
            "description": "Name of the variable"
          },
          "businessImpact": {
            "type": "string",
            "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
          },
          "allowsNulls": {
            "type": "string",
            "description": "True if null values are expected for this variable"
          },
          "shouldBeUnique": {
            "type": "string",
            "description": "True if only unique values are expected for this variable"
          }
        }
      }
    }
  }
}
K-Means Cluster Analysis Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/cluster_analysis/kmeans",
  "title": "KMeans Cluster Analysis",
  "description": "KMeans cluster analysis of two variables in a given data set",
  "type": "object",
  "properties": {
    "method": {
      "type": "string",
      "const": "kmeans",
      "description": "Cluster analysis method"
    },
    "count": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "variables": {
      "type": "array",
      "description": "Name of the variables",
      "items": {
        "type": "string"
      },
      "minItems": 2,
      "maxItems": 2,
      "uniqueItems": true
    },
    "svg": {
      "type": "string",
      "description": "Cluster analysis SVG graph"
    }
  }
}
Correlation Analysis Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/correlations",
  "title": "Correlation Analysis",
  "description": "Correlation analysis of variables in a dataset",
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "method": {
        "type": "string",
        "enum": [
          "kendall",
          "pearson",
          "spearman"
        ],
        "description": "Correlation analysis method"
      },
      "count": {
        "type": "integer",
        "description": "Number of rows analyzed"
      },
      "variables": {
        "type": "array",
        "description": "Names of the variables",
        "items": {
          "type": "string"
        },
        "uniqueItems": true
      },
      "coefficients": {
        "type": "array",
        "description": "Correlation coefficients for a pair of variables",
        "items": {
          "type": "array",
          "items": {
            "type": "number",
            "minimum": -1.0,
            "maximum": 1.0
          }
        }
      },
      "svg": {
        "type": "string",
        "description": "Correlation analysis SVG graph"
      }
    }
  }
}
DQ Analysis Summary Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:CleansingResults",
  "type": "object",
  "description": "Summarized results of Rules driven data quality analysis",
  "properties": {
    "inputFile": {
      "type": "string",
      "description": "File name and location of the input data set"
    },
    "outputFile": {
      "type": "string",
      "description": "File name and location of the output data set"
    },
    "dqScore": {
      "type": "integer",
      "description": "Data quality score for the data set"
    },
    "results": {
      "type": "array",
      "description": "Results for each DQ Rule executed against a variable or group of variables",
      "items": {
        "type": "object",
        "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:Summary",
        "properties": {
          "fileName": {
            "type": "string",
            "description": "File name and location of the cleansed output generated for this combination of Variable or Group of Variables and DQ Rule"
          },
          "ruleId": {
            "type": "string",
            "description": "Unique identifier of the DQ Rule"
          },
          "groupName": {
            "type": "string",
            "description": "Name of the Variable or Group of Variables"
          },
          "dqScore": {
            "type": "integer",
            "description": "Data quality score for this combination of Variable or Group of Variables and DQ Rule"
          },
          "tags": {
            "type": "object",
            "additionalProperties": {
              "type": "integer",
              "description": "Tag values generated during the DQ analysis"
            }
          },
          "countCleansed": {
            "type": "integer",
            "description": "Total number of cleansed values"
          },
          "countMissing": {
            "type": "integer",
            "description": "Total number of missing values"
          },
          "countProcessed": {
            "type": "integer",
            "description": "Total number of processed values"
          },
          "countInvalid": {
            "type": "integer",
            "description": "Total number of invalid values"
          },
          "countValid": {
            "type": "integer",
            "description": "Total number of valid values"
          }
        }
      }
    },
    "businessContext": {
      "type": "object",
      "$id": "urn:jsonschema:com:ibi:dq:common:model:dataset:DataSetContext",
      "properties": {
        "id": {
          "type": "string",
          "description": "Name of the variable"
        },
        "businessImpact": {
          "type": "string",
          "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
        },
        "allowsNulls": {
          "type": "string",
          "description": "True if null values are expected for this variable"
        },
        "shouldBeUnique": {
          "type": "string",
          "description": "True if only unique values are expected for this variable"
        }
      }
    }
  }
}