JSON Schemas

This appendix provides a reference for the JSON schema documents used by ibi Data Quality for data analysis results.

Profiling Results JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/profile",
  "title": "Profile",
  "description": "Profile of the dataset",
  "type": "object",
  "properties": {
    "inputDataSetName": {
      "type": "string",
      "description": "Name of the input data set"
    },
    "createdBy": {
      "type": "string",
      "description": "User account that executed profiling analysis and created the data profile"
    },
    "createdDate": {
      "type": "string",
      "description": "Date and time when the data profile is created"
    },
    "countRows": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "countVariables": {
      "type": "integer",
      "description": "Number of columns analyzed"
    },
    "countObservations": {
      "type": "integer",
      "description": "Number of observations (rows times columns) analyzed"
    },
    "countDuplicateRows": {
      "type": "integer",
      "description": "Number of duplicate rows in the data set"
    },
    "pctDuplicateRows": {
      "type": "number",
      "description": "Percentage of total rows that are duplicate rows in the data set"
    },
    "countDuplicateRowsRemoved": {
      "type": "integer",
      "description": "Number of duplicate rows removed in the deduplication process"
    },
    "pctDuplicateRowsRemoved": {
      "type": "number",
      "description": "Percentage of total rows that were duplicate and were removed in the deduplication process"
    },
    "countRowsAfterDeduplication": {
      "type": "number",
      "description": "Number of total rows that remain in the data set after the deduplication process"
    },
    "countMissing": {
      "type": "integer",
      "description": "Number of observations that are missing (blank or null) in the data set"
    },
    "pctMissing": {
      "type": "number",
      "description": "Percentage of total observations that are missing (blank or null) in the data set"
    },
    "profileScore": {
      "type": "number",
      "description": "Data profile score for the entire data set"
    },
    "variables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string",
            "description": "Name of the variable"
          },
          "type": {
            "enum": [
              "string",
              "number",
              "date"
            ],
            "description": "Data type of the variable"
          },
          "count": {
            "type": "integer",
            "description": "Total number of observations"
          },
          "countDistinct": {
            "type": "integer",
            "description": "Total number of distinct observations, distinct includes null and blank values"
          },
          "countUnique": {
            "type": "integer",
            "description": "Total number of unique observations, unique does not include null or blank values"
          },
          "countNonUnique": {
            "type": "integer",
            "description": "Total number of non-unique observations"
          },
          "countNulls": {
            "type": "integer",
            "description": "Total number of null observations"
          },
          "countNotNulls": {
            "type": "integer",
            "description": "Total number of non-null observations"
          },
          "countBlanks": {
            "type": "integer",
            "description": "Total number of blank observations"
          },
          "pctUnique": {
            "type": "number",
            "description": "Percentage of total observations that are unique"
          },
          "pctComplete": {
            "type": "number",
            "description": "Percentage of total observations that are not null or blank"
          },
          "sensitive": {
            "type": "boolean",
            "description": "true if the variable contains sensitive data"
          },
          "frequency": {
            "type": "array",
            "description": "Frequency of values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Observation value"
                },
                "count": {
                  "type": "integer",
                  "description": "Number of observations that has this value"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observations that has this value"
                }
              }
            }
          },
          "lengths": {
            "type": "object",
            "description": "Character length of the observation values",
            "properties": {
              "min": {
                "type": "integer",
                "description": "Minimum length of all the observation values"
              },
              "max": {
                "type": "integer",
                "description": "Maximum length of all the observation values"
              },
              "median": {
                "type": "number",
                "description": "Median length of all the observation values"
              },
              "avg": {
                "type": "number",
                "description": "Average length of all the observation values"
              }
            }
          },
          "patterns": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Patterns discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Pattern value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this pattern"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this pattern"
                }
              }
            }
          },
          "masks": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "Masks discovered from the observation values",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Mask value"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that have this mask"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that have this mask"
                }
              }
            }
          },
          "contentTypes": {
            "type": "array",
            "description": "Data classes discovered from the observation values",
            "items": {
              "type": "object",
              "properties": {
                "value": {
                  "type": "string",
                  "description": "Name of the data class"
                },
                "count": {
                  "type": "integer",
                  "description": "Total number of observation values that belong to this data class"
                },
                "pct": {
                  "type": "number",
                  "description": "Percentage of total observation values that belong to this data class"
                }
              }
            }
          },
          "numeric": {
            "type": "array",
            "description": "Numerical statistics of the observation values",
            "items": {
              "type": "object",
              "properties": {
                "variable": {
                  "type": "string",
                  "description": "Name of the variable"
                },
                "avg": {
                  "type": "number",
                  "description": "Average value"
                },
                "min": {
                  "type": "number",
                  "description": "Minimum value"
                },
                "max": {
                  "type": "number",
                  "description": "Maximum value"
                },
                "stdDev": {
                  "type": "number",
                  "description": "Standard deviation"
                },
                "pctl25": {
                  "type": "number",
                  "description": "25th percentile"
                },
                "pctl50": {
                  "type": "number",
                  "description": "50th percentile"
                },
                "pctl75": {
                  "type": "number",
                  "description": "75th percentile"
                }
              }
            }
          },
          "profileScore": {
            "type": "number",
            "description": "Profile score for the individual variable"
          }
        }
      }
    },
    "notNullsSvg": {
      "type": "string",
      "description": "XML representing the completeness chart of the data set"
    },
    "variableOptions": {
      "type": "array",
      "items": {
        "type": "object",
        "description": "Data expectations set by the user",
        "properties": {
          "id": {
            "type": "string",
            "description": "Name of the variable"
          },
          "businessImpact": {
            "type": "string",
            "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
          },
          "allowsNulls": {
            "type": "string",
            "description": "True if null values are expected for this variable"
          },
          "shouldBeUnique": {
            "type": "string",
            "description": "True if only unique values are expected for this variable"
          }
        }
      }
    }
  }
}

K-Means Cluster Analysis Results JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/cluster_analysis/kmeans",
  "title": "KMeans Cluster Analysis",
  "description": "KMeans cluster analysis of two variables in a given data set",
  "type": "object",
  "properties": {
    "method": {
      "type": "string",
      "const": "kmeans",
      "description": "Cluster analysis method"
    },
    "count": {
      "type": "integer",
      "description": "Number of rows analyzed"
    },
    "variables": {
      "type": "array",
      "description": "Name of the variables",
      "items": {
        "type": "string"
      },
      "minItems": 2,
      "maxItems": 2,
      "uniqueItems": true
    },
    "svg": {
      "type": "string",
      "description": "Cluster analysis SVG graph"
    }
  }
}

Correlation Analysis Results JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://ibi.com/dq/schemas/correlations",
  "title": "Correlation Analysis",
  "description": "Correlation analysis of variables in a dataset",
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "method": {
        "type": "string",
        "enum": [
          "kendall",
          "pearson",
          "spearman"
        ],
        "description": "Correlation analysis method"
      },
      "count": {
        "type": "integer",
        "description": "Number of rows analyzed"
      },
      "variables": {
        "type": "array",
        "description": "Names of the variables",
        "items": {
          "type": "string"
        },
        "uniqueItems": true
      },
      "coefficients": {
        "type": "array",
        "description": "Correlation coefficients for a pair of variables",
        "items": {
          "type": "array",
          "items": {
            "type": "number",
            "minimum": -1.0,
            "maximum": 1.0
          }
        }
      },
      "svg": {
        "type": "string",
        "description": "Correlation analysis SVG graph"
      }
    }
  }
}

DQ Analysis Summary Results JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:CleansingResults",
  "type": "object",
  "description": "Summarized results of Rules driven data quality analysis",
  "properties": {
    "inputFile": {
      "type": "string",
      "description": "File name and location of the input data set"
    },
    "outputFile": {
      "type": "string",
      "description": "File name and location of the output data set"
    },
    "dqScore": {
      "type": "integer",
      "description": "Data quality score for the data set"
    },
    "results": {
      "type": "array",
      "description": "Results for each DQ Rule executed against a variable or group of variables",
      "items": {
        "type": "object",
        "$id": "urn:jsonschema:com:ibi:dq:common:model:butler:Summary",
        "properties": {
          "fileName": {
            "type": "string",
            "description": "File name and location of the cleansed output generated for this combination of Variable or Group of Variables and DQ Rule"
          },
          "ruleId": {
            "type": "string",
            "description": "Unique identifier of the DQ Rule"
          },
          "groupName": {
            "type": "string",
            "description": "Name of the Variable or Group of Variables"
          },
          "dqScore": {
            "type": "integer",
            "description": "Data quality score for this combination of Variable or Group of Variables and DQ Rule"
          },
          "tags": {
            "type": "object",
            "additionalProperties": {
              "type": "integer",
              "description": "Tag values generated during the DQ analysis"
            }
          },
          "countCleansed": {
            "type": "integer",
            "description": "Total number of cleansed values"
          },
          "countMissing": {
            "type": "integer",
            "description": "Total number of missing values"
          },
          "countProcessed": {
            "type": "integer",
            "description": "Total number of processed values"
          },
          "countInvalid": {
            "type": "integer",
            "description": "Total number of invalid values"
          },
          "countValid": {
            "type": "integer",
            "description": "Total number of valid values"
          }
        }
      }
    },
    "businessContext": {
      "type": "object",
      "$id": "urn:jsonschema:com:ibi:dq:common:model:dataset:DataSetContext",
      "properties": {
        "id": {
          "type": "string",
          "description": "Name of the variable"
        },
        "businessImpact": {
          "type": "string",
          "description": "A number that represents the impact (HIGH, MEDIUM or LOW) of the variable on business outcomes"
        },
        "allowsNulls": {
          "type": "string",
          "description": "True if null values are expected for this variable"
        },
        "shouldBeUnique": {
          "type": "string",
          "description": "True if only unique values are expected for this variable"
        }
      }
    }
  }
}