JSON Schemas
This appendix provides a reference for the JSON schema documents used by ibi Data Quality for data analysis results.
Profiling Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/profile",
  "title": "Profile",
  "description": "Profile of the dataset",
  "type": "object",
  "properties": {
    "inputDataSetName": {
      "type": "string",
      "description": "Name of the input data set"
    },
    "createdBy": {
      "type": "string",
      "description": "User account that executed profiling analysis and created the data profile"
    },
    "createdDate": {
      "type": "string",
      "description": "Date and time when the data profile is created"
    },
    "countRows": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "countVariables": {
      "type": "integer",
      "description": "Number of columns analyzed"
    },
    "countObservations": {
      "type": "integer",
      "description": "Number of observations (rows times columns) analyzed"
    },
    "countDuplicateRows": {
      "type": "integer",
      "description": "Number of duplicate rows in the data set"
    },
    "pctDuplicateRows": {
      "type": "number",
      "description": "Percentage of total rows that are duplicate rows in the data set"
    },
    "countDuplicateRowsRemoved": {
      "type": "integer",
      "description": "Number of duplicate rows removed in the deduplication process"
    },
    "pctDuplicateRowsRemoved": {
      "type": "number",
      "description": "Percentage of total rows that were duplicate and were removed in the deduplication process"
    },
    "countRowsAfterDeduplication": {
      "type": "integer",
      "description": "Number of total rows that remain in the data set after the deduplication process"
    },
    "countMissing": {
      "type": "integer",
      "description": "Number of observations that are missing (blank or null) in the data set"
    },
    "pctMissing": {
      "type": "number",
      "description": "Percentage of total observations that are missing (blank or null) in the data set"
    },
    "profileScore": {
      "type": "number",
      "description": "Data profile score for the entire data set"
    },
    "variables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string",
            "description": "Name of the variable"
          },
          "type": {
            "enum": [
              "string",
              "number",
              "date"
            ],
            "description": "Data type of the variable"
          },
          "count": {
            "type": "integer",
            "description": "Total number of observations"
          },
          "countDistinct": {
            "type": "integer",
            "description": "Total number of distinct observations, distinct includes null and blank values"
          },
          "countUnique": {
            "type": "integer",
            "description": "Total number of unique observations, unique does not include null or blank values"
          },
          "countNonUnique": {
            "type": "integer",
            "description": "Total number of non-unique observations"
          },
          "countNulls": {
            "type": "integer",
            "description": "Total number of null observations"
          },
          "countNotNulls": {
            "type": "integer",
            "description": "Total number of non-null observations"
          },
          "countBlanks": {
            "type": "integer",
            "description": "Total number of blank observations"
          },
          "pctUnique": {
            "type": "number",
            "description": "Percentage of total observations that are unique"
          },
          "pctComplete": {
            "type": "number",
            "description": "Percentage of total observations that are not null or blank"
          },
          "sensitive": {
            "type": "boolean",
            "description": "true if the variable contains sensitive data"
          },
          "frequency": {
            "type": "array",
            "description": "Frequency of values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Observation value"
                },
                "count": {
                  "type": "integer",
                  "description": "Number of observations that has this value"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observations that has this value"
                }
              }
            }
          },
          "lengths": {
            "type": "object",
            "description": "Character length of the observation values",
            "properties": {
              "min": {
                "type": "integer",
                "description": "Minimum length of all the observation values"
              },
              "max": {
                "type": "integer",
                "description": "Maximum length of all the observation values"
              },
              "median": {
                "type": "number",
                "description": "Median length of all the observation values"
              },
              "avg": {
                "type": "number",
                "description": "Average length of all the observation values"
              }
            }
          },
          "patterns": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Patterns discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Pattern value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this pattern"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this pattern"
                }
              }
            }
          },
          "masks": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Masks discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Mask value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this mask"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this mask"
                }
              }
            }
          },
          "contentTypes": {
            "type": "array",
            "description": "Data classes discovered from the observation values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Name of the data class"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that belong to this data class"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that belong to this data class"
                }
              }
            }
          },
          "numeric": {
            "type": "array",
            "description": "Numerical statistics of the observation values",
            "items": {
              "type": "object",
              "properties": {
                "variable": {
                  "type": "string",
                  "description": "Name of the variable"
                },
                "avg": {
                  "type": "number",
                  "description": "Average value"
                },
                "min": {
                  "type": "number",
                  "description": "Minimum value"
                },
                "max": {
                  "type": "number",
                  "description": "Maximum value"
                },
                "stdDev": {
                  "type": "number",
                  "description": "Standard deviation"
                },
                "pctl25": {
                  "type": "number",
                  "description": "25th percentile"
                },
                "pctl50": {
                  "type": "number",
                  "description": "50th percentile"
                },
                "pctl75": {
                  "type": "number",
                  "description": "75th percentile"
                }
              }
            }
          },
          "profileScore": {
            "type": "number",
            "description": "Profile score for the individual variable"
          }
        }
      }
    },
    "notNullsSvg": {
      "type": "string",
      "description": "XML representing the completeness chart of the data set"
    },
    "variableOptions": {
      "type": "array",
      "items": {
        "type": "object",
        "description": "Data expectations set by the user",
        "properties": {
          "id": {
            "type": "string",
            "description": "Name of the variable"
          },
          "businessImpact": {
            "type": "string",
            "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
          },
          "allowsNulls": {
            "type": "string",
            "description": "True if null values are expected for this variable"
          },
          "shouldBeUnique": {
            "type": "string",
            "description": "True if only unique values are expected for this variable"
          }
        }
      }
    }
  }
}
K-Means Cluster Analysis Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/cluster_analysis/kmeans",
  "title": "KMeans Cluster Analysis",
  "description": "KMeans cluster analysis of two variables in a given data set",
  "type": "object",
  "properties": {
    "method": {
      "type": "string",
      "const": "kmeans",
      "description": "Cluster analysis method"
    },
    "count": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "variables": {
      "type": "array",
      "description": "Name of the variables",
      "items": {
        "type": "string"
      },
      "minItems": 2,
      "maxItems": 2,
      "uniqueItems": true
    },
    "svg": {
      "type": "string",
      "description": "Cluster analysis SVG graph"
    }
  }
}
Correlation Analysis Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/correlations",
  "title": "Correlation Analysis",
  "description": "Correlation analysis of variables in a dataset",
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "method": {
        "type": "string",
        "enum": [
          "kendall",
          "pearson",
          "spearman"
        ],
        "description": "Correlation analysis method"
      },
      "count": {
        "type": "integer",
        "description": "Number of rows analyzed"
      },
      "variables": {
        "type": "array",
        "description": "Names of the variables",
        "items": {
          "type": "string"
        },
        "uniqueItems": true
      },
      "coefficients": {
        "type": "array",
        "description": "Correlation coefficients for a pair of variables",
        "items": {
          "type": "array",
          "items": {
            "type": "number",
            "minimum": -1.0,
            "maximum": 1.0
          }
        }
      },
      "svg": {
        "type": "string",
        "description": "Correlation analysis SVG graph"
      }
    }
  }
}
DQ Analysis Summary Results JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:CleansingResults",
  "type": "object",
  "description": "Summarized results of Rules driven data quality analysis",
  "properties": {
    "inputFile": {
      "type": "string",
      "description": "File name and location of the input data set"
    },
    "outputFile": {
      "type": "string",
      "description": "File name and location of the output data set"
    },
    "dqScore": {
      "type": "integer",
      "description": "Data quality score for the data set"
    },
    "results": {
      "type": "array",
      "description": "Results for each DQ Rule executed against a variable or group of variables",
      "items": {
        "type": "object",
        "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:Summary",
        "properties": {
          "fileName": {
            "type": "string",
            "description": "File name and location of the cleansed output generated for this combination of Variable or Group of Variables and DQ Rule"
          },
          "ruleId": {
            "type": "string",
            "description": "Unique identifier of the DQ Rule"
          },
          "groupName": {
            "type": "string",
            "description": "Name of the Variable or Group of Variables"
          },
          "dqScore": {
            "type": "integer",
            "description": "Data quality score for this combination of Variable or Group of Variables and DQ Rule"
          },
          "tags": {
            "type": "object",
            "additionalProperties": {
              "type": "integer",
              "description": "Tag values generated during the DQ analysis"
            }
          },
          "countCleansed": {
            "type": "integer",
            "description": "Total number of cleansed values"
          },
          "countMissing": {
            "type": "integer",
            "description": "Total number of missing values"
          },
          "countProcessed": {
            "type": "integer",
            "description": "Total number of processed values"
          },
          "countInvalid": {
            "type": "integer",
            "description": "Total number of invalid values"
          },
          "countValid": {
            "type": "integer",
            "description": "Total number of valid values"
          }
        }
      }
    },
    "businessContext": {
      "type": "object",
      "$id": "urn:jsonschema:com:ibi:dq:common:model:dataset:DataSetContext",
      "properties": {
        "id": {
          "type": "string",
          "description": "Name of the variable"
        },
        "businessImpact": {
          "type": "string",
          "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
        },
        "allowsNulls": {
          "type": "string",
          "description": "True if null values are expected for this variable"
        },
        "shouldBeUnique": {
          "type": "string",
          "description": "True if only unique values are expected for this variable"
        }
      }
    }
  }
}