spec:
  id: 15673922-abc6-48e0-a226-ca225654355b
  name: gretel_train_5k_samples_spec
  description: ''
  datasetId: 551e6634-c9f9-4d62-9e7d-fef749242c24
  datasetName: gretel_train_5k_samples
  requirements: 'Each data point is a complete SQL query example with 9 fields: domain (single or two-word industry/field name), domain_description (one sentence describing the domain''s data scope), sql_complexity (categorizing query complexity level), sql_complexity_description (brief explanation of the complexity level), sql_task_type (categorizing the query''s purpose), sql_task_type_description (brief explanation of the task type), sql_prompt (natural language request for what to retrieve/do), sql_context (CREATE TABLE and INSERT statements setting up the database schema and sample data), sql (the SQL query answering the prompt), and sql_explanation (explanation of how the query works). The sql_context must contain valid SQL DDL and DML statements. The sql field must contain a valid SQL query that would execute against the schema in sql_context. The sql_explanation must describe the query''s logic. All queries focus on realistic business/organizational data scenarios.'
  spec_version: '1'
  data_property_variations:
    - base_distributions:
        agricultural technology: 4
        aviation operations: 3
        biotechnology research: 3
        construction management: 3
        cybersecurity operations: 4
        defense systems: 3
        e-commerce: 3
        education technology: 4
        environmental monitoring: 3
        financial trading: 4
        food safety: 3
        government services: 3
        healthcare analytics: 3
        hospitality management: 3
        insurance operations: 3
        manufacturing operations: 3
        maritime logistics: 3
        media streaming: 4
        mining operations: 3
        nonprofit fundraising: 3
        pharmaceutical research: 4
        real estate management: 3
        renewable energy: 4
        retail analytics: 3
        smart cities: 4
        space exploration: 3
        sports analytics: 3
        supply chain logistics: 4
        telecommunications: 4
        transportation networks: 3
      conditional_distributions: {}
      property_name: Domain
      property_values:
        - e-commerce
        - healthcare analytics
        - supply chain logistics
        - education technology
        - financial trading
        - renewable energy
        - telecommunications
        - media streaming
        - smart cities
        - agricultural technology
        - cybersecurity operations
        - pharmaceutical research
        - aviation operations
        - maritime logistics
        - construction management
        - retail analytics
        - insurance operations
        - nonprofit fundraising
        - government services
        - biotechnology research
        - manufacturing operations
        - hospitality management
        - sports analytics
        - environmental monitoring
        - transportation networks
        - real estate management
        - food safety
        - mining operations
        - defense systems
        - space exploration
    - base_distributions:
        CTEs: 8
        aggregation: 15
        basic SQL: 10
        multiple joins: 20
        set operations: 2
        single join: 15
        subqueries: 18
        window functions: 12
      conditional_distributions: {}
      property_name: SQL Complexity
      property_values:
        - basic SQL
        - aggregation
        - single join
        - multiple joins
        - subqueries
        - window functions
        - CTEs
        - set operations
    - base_distributions:
        '1': 30
        '2': 35
        '3': 20
        '4': 10
        '5': 5
      conditional_distributions:
        SQL Complexity:
          CTEs:
            '1': 5
            '2': 25
            '3': 40
            '4': 20
            '5': 10
          aggregation:
            '1': 70
            '2': 20
            '3': 8
            '4': 2
            '5': 0
          basic SQL:
            '1': 80
            '2': 15
            '3': 5
            '4': 0
            '5': 0
          multiple joins:
            '1': 0
            '2': 0
            '3': 45
            '4': 40
            '5': 15
          set operations:
            '1': 0
            '2': 40
            '3': 35
            '4': 20
            '5': 5
          single join:
            '1': 0
            '2': 70
            '3': 25
            '4': 5
            '5': 0
          subqueries:
            '1': 30
            '2': 35
            '3': 25
            '4': 8
            '5': 2
          window functions:
            '1': 40
            '2': 30
            '3': 20
            '4': 8
            '5': 2
      property_name: Number of Tables
      property_values:
        - '1'
        - '2'
        - '3'
        - '4'
        - '5'
    - base_distributions:
        cross join: 2
        inner join: 35
        left join: 10
        multiple join types: 5
        not applicable: 45
        right join: 3
      conditional_distributions:
        Number of Tables:
          '1':
            cross join: 0
            inner join: 0
            left join: 0
            multiple join types: 0
            not applicable: 100
            right join: 0
          '2':
            cross join: 3
            inner join: 70
            left join: 18
            multiple join types: 3
            not applicable: 0
            right join: 6
          '3':
            cross join: 4
            inner join: 55
            left join: 25
            multiple join types: 8
            not applicable: 0
            right join: 8
          '4':
            cross join: 5
            inner join: 45
            left join: 30
            multiple join types: 10
            not applicable: 0
            right join: 10
          '5':
            cross join: 6
            inner join: 40
            left join: 30
            multiple join types: 12
            not applicable: 0
            right join: 12
        SQL Complexity:
          CTEs:
            cross join: 3
            inner join: 50
            left join: 15
            multiple join types: 5
            not applicable: 20
            right join: 7
          aggregation:
            cross join: 0
            inner join: 0
            left join: 0
            multiple join types: 0
            not applicable: 100
            right join: 0
          basic SQL:
            cross join: 0
            inner join: 0
            left join: 0
            multiple join types: 0
            not applicable: 100
            right join: 0
          multiple joins:
            cross join: 4
            inner join: 60
            left join: 20
            multiple join types: 8
            not applicable: 0
            right join: 8
          set operations:
            cross join: 5
            inner join: 40
            left join: 15
            multiple join types: 2
            not applicable: 30
            right join: 8
          single join:
            cross join: 3
            inner join: 75
            left join: 15
            multiple join types: 2
            not applicable: 0
            right join: 5
          subqueries:
            cross join: 2
            inner join: 25
            left join: 8
            multiple join types: 2
            not applicable: 60
            right join: 3
          window functions:
            cross join: 1
            inner join: 18
            left join: 7
            multiple join types: 1
            not applicable: 70
            right join: 3
      property_name: Join Type
      property_values:
        - not applicable
        - inner join
        - left join
        - right join
        - cross join
        - multiple join types
    - base_distributions:
        analytics and reporting: 75
        data definition: 3
        data manipulation: 15
        data retrieval: 7
      conditional_distributions: {}
      property_name: SQL Task Type
      property_values:
        - analytics and reporting
        - data manipulation
        - data retrieval
        - data definition
    - base_distributions:
        complex multi-part query: 12
        conversational request: 15
        direct question: 35
        imperative command: 25
        informal with vague terms: 5
        terse keyword-style: 8
      conditional_distributions: {}
      property_name: Prompt Style
      property_values:
        - direct question
        - imperative command
        - conversational request
        - terse keyword-style
        - complex multi-part query
        - informal with vague terms
    - base_distributions:
        2-3: 15
        4-5: 45
        6-7: 30
        8-10: 10
      conditional_distributions: {}
      property_name: Average Columns per Table
      property_values:
        - 2-3
        - 4-5
        - 6-7
        - 8-10
    - base_distributions:
        'no': 55
        'yes': 45
      conditional_distributions:
        Number of Tables:
          '1':
            'no': 100
            'yes': 0
          '2':
            'no': 50
            'yes': 50
          '3':
            'no': 35
            'yes': 65
          '4':
            'no': 25
            'yes': 75
          '5':
            'no': 15
            'yes': 85
      property_name: Uses Foreign Keys
      property_values:
        - 'yes'
        - 'no'
    - base_distributions:
        'no': 60
        'yes': 40
      conditional_distributions: {}
      property_name: Uses Primary Keys
      property_values:
        - 'yes'
        - 'no'
    - base_distributions:
        AVG: 10
        COUNT: 15
        GROUP BY with HAVING: 5
        MIN or MAX: 8
        SUM: 12
        multiple aggregation functions: 15
        none: 35
      conditional_distributions:
        SQL Complexity:
          CTEs:
            AVG: 12
            COUNT: 15
            GROUP BY with HAVING: 3
            MIN or MAX: 10
            SUM: 15
            multiple aggregation functions: 15
            none: 30
          aggregation:
            AVG: 18
            COUNT: 25
            GROUP BY with HAVING: 5
            MIN or MAX: 12
            SUM: 20
            multiple aggregation functions: 20
            none: 0
          basic SQL:
            AVG: 3
            COUNT: 8
            GROUP BY with HAVING: 0
            MIN or MAX: 2
            SUM: 6
            multiple aggregation functions: 1
            none: 80
          multiple joins:
            AVG: 10
            COUNT: 18
            GROUP BY with HAVING: 2
            MIN or MAX: 8
            SUM: 15
            multiple aggregation functions: 12
            none: 35
          set operations:
            AVG: 8
            COUNT: 12
            GROUP BY with HAVING: 2
            MIN or MAX: 8
            SUM: 10
            multiple aggregation functions: 10
            none: 50
          single join:
            AVG: 10
            COUNT: 15
            GROUP BY with HAVING: 3
            MIN or MAX: 8
            SUM: 12
            multiple aggregation functions: 12
            none: 40
          subqueries:
            AVG: 10
            COUNT: 12
            GROUP BY with HAVING: 3
            MIN or MAX: 8
            SUM: 10
            multiple aggregation functions: 12
            none: 45
          window functions:
            AVG: 15
            COUNT: 15
            GROUP BY with HAVING: 3
            MIN or MAX: 10
            SUM: 12
            multiple aggregation functions: 15
            none: 30
      property_name: Aggregation Functions Used
      property_values:
        - none
        - COUNT
        - SUM
        - AVG
        - MIN or MAX
        - multiple aggregation functions
        - GROUP BY with HAVING
    - base_distributions:
        LAG or LEAD: 4
        NTILE or PERCENT_RANK: 1
        RANK or DENSE_RANK: 5
        ROW_NUMBER: 6
        aggregation with OVER: 4
        not applicable: 80
      conditional_distributions:
        SQL Complexity:
          CTEs:
            LAG or LEAD: 3
            NTILE or PERCENT_RANK: 1
            RANK or DENSE_RANK: 4
            ROW_NUMBER: 5
            aggregation with OVER: 2
            not applicable: 85
          aggregation:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          basic SQL:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          multiple joins:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          set operations:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          single join:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          subqueries:
            LAG or LEAD: 0
            NTILE or PERCENT_RANK: 0
            RANK or DENSE_RANK: 0
            ROW_NUMBER: 0
            aggregation with OVER: 0
            not applicable: 100
          window functions:
            LAG or LEAD: 20
            NTILE or PERCENT_RANK: 5
            RANK or DENSE_RANK: 25
            ROW_NUMBER: 30
            aggregation with OVER: 20
            not applicable: 0
      property_name: Window Function Type
      property_values:
        - not applicable
        - ROW_NUMBER
        - RANK or DENSE_RANK
        - LAG or LEAD
        - aggregation with OVER
        - NTILE or PERCENT_RANK
    - base_distributions:
        EXCEPT: 1
        INTERSECT: 1
        UNION: 2
        UNION ALL: 1
        not applicable: 95
      conditional_distributions:
        SQL Complexity:
          CTEs:
            EXCEPT: 1
            INTERSECT: 1
            UNION: 2
            UNION ALL: 1
            not applicable: 95
          aggregation:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
          basic SQL:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
          multiple joins:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
          set operations:
            EXCEPT: 10
            INTERSECT: 20
            UNION: 40
            UNION ALL: 30
            not applicable: 0
          single join:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
          subqueries:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
          window functions:
            EXCEPT: 0
            INTERSECT: 0
            UNION: 0
            UNION ALL: 0
            not applicable: 100
      property_name: Set Operation Type
      property_values:
        - not applicable
        - UNION
        - UNION ALL
        - INTERSECT
        - EXCEPT
    - base_distributions:
        EXISTS or NOT EXISTS: 3
        IN or NOT IN subquery: 8
        correlated subquery: 4
        multiple subqueries: 2
        not applicable: 75
        scalar subquery in WHERE: 6
        subquery in FROM clause: 2
      conditional_distributions:
        SQL Complexity:
          CTEs:
            EXISTS or NOT EXISTS: 1
            IN or NOT IN subquery: 3
            correlated subquery: 2
            multiple subqueries: 0
            not applicable: 90
            scalar subquery in WHERE: 3
            subquery in FROM clause: 1
          aggregation:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
          basic SQL:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
          multiple joins:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
          set operations:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
          single join:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
          subqueries:
            EXISTS or NOT EXISTS: 12
            IN or NOT IN subquery: 30
            correlated subquery: 18
            multiple subqueries: 5
            not applicable: 0
            scalar subquery in WHERE: 25
            subquery in FROM clause: 10
          window functions:
            EXISTS or NOT EXISTS: 0
            IN or NOT IN subquery: 0
            correlated subquery: 0
            multiple subqueries: 0
            not applicable: 100
            scalar subquery in WHERE: 0
            subquery in FROM clause: 0
      property_name: Subquery Type
      property_values:
        - not applicable
        - scalar subquery in WHERE
        - IN or NOT IN subquery
        - EXISTS or NOT EXISTS
        - correlated subquery
        - subquery in FROM clause
        - multiple subqueries
    - base_distributions:
        multiple CTEs: 4
        not applicable: 90
        recursive CTE: 1
        single CTE: 5
      conditional_distributions:
        SQL Complexity:
          CTEs:
            multiple CTEs: 35
            not applicable: 0
            recursive CTE: 5
            single CTE: 60
          aggregation:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          basic SQL:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          multiple joins:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          set operations:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          single join:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          subqueries:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
          window functions:
            multiple CTEs: 0
            not applicable: 100
            recursive CTE: 0
            single CTE: 0
      property_name: CTE Usage
      property_values:
        - not applicable
        - single CTE
        - multiple CTEs
        - recursive CTE
    - base_distributions:
        date arithmetic: 12
        date extraction: 8
        date filtering: 20
        multiple date functions: 5
        none: 55
      conditional_distributions: {}
      property_name: Date/Time Functions Used
      property_values:
        - none
        - date filtering
        - date arithmetic
        - date extraction
        - multiple date functions
    - base_distributions:
        LIKE or pattern matching: 12
        UPPER/LOWER case: 4
        concatenation: 6
        multiple string functions: 4
        none: 70
        substring operations: 4
      conditional_distributions: {}
      property_name: String Functions Used
      property_values:
        - none
        - LIKE or pattern matching
        - concatenation
        - substring operations
        - UPPER/LOWER case
        - multiple string functions
    - base_distributions:
        IN or BETWEEN operators: 15
        complex with AND/OR/NOT: 15
        multiple conditions with AND: 25
        multiple conditions with OR: 10
        no WHERE clause: 8
        simple equality: 15
        subquery in WHERE: 12
      conditional_distributions: {}
      property_name: WHERE Clause Complexity
      property_values:
        - no WHERE clause
        - simple equality
        - multiple conditions with AND
        - multiple conditions with OR
        - complex with AND/OR/NOT
        - IN or BETWEEN operators
        - subquery in WHERE
    - base_distributions:
        multiple columns: 15
        not used: 45
        ordering by aggregation: 10
        single column ASC: 10
        single column DESC: 20
      conditional_distributions: {}
      property_name: ORDER BY Usage
      property_values:
        - not used
        - single column ASC
        - single column DESC
        - multiple columns
        - ordering by aggregation
    - base_distributions:
        LIMIT with larger number: 10
        LIMIT with small number (1-10): 18
        TOP clause: 2
        not used: 70
      conditional_distributions: {}
      property_name: LIMIT or TOP Usage
      property_values:
        - not used
        - LIMIT with small number (1-10)
        - LIMIT with larger number
        - TOP clause
    - base_distributions:
        COUNT DISTINCT: 13
        DISTINCT on multiple columns: 5
        DISTINCT on single column: 12
        not used: 70
      conditional_distributions: {}
      property_name: DISTINCT Usage
      property_values:
        - not used
        - DISTINCT on single column
        - DISTINCT on multiple columns
        - COUNT DISTINCT
    - base_distributions:
        COALESCE or IFNULL: 5
        IS NULL or IS NOT NULL: 15
        NULL in aggregation: 3
        NULLIF: 2
        not applicable: 75
      conditional_distributions: {}
      property_name: NULL Handling
      property_values:
        - not applicable
        - IS NULL or IS NOT NULL
        - COALESCE or IFNULL
        - NULLIF
        - NULL in aggregation
    - base_distributions:
        CASE in SELECT: 7
        CASE in aggregation: 4
        nested CASE: 1
        not used: 80
        simple CASE: 8
      conditional_distributions: {}
      property_name: CASE Statement Usage
      property_values:
        - not used
        - simple CASE
        - CASE in SELECT
        - CASE in aggregation
        - nested CASE
    - base_distributions:
        diverse types (4+ different types): 15
        includes BOOLEAN: 15
        includes DATE or TIMESTAMP: 30
        includes DECIMAL or FLOAT: 25
        integers and strings only: 15
      conditional_distributions: {}
      property_name: Data Types in Schema
      property_values:
        - integers and strings only
        - includes DATE or TIMESTAMP
        - includes DECIMAL or FLOAT
        - includes BOOLEAN
        - diverse types (4+ different types)
    - base_distributions:
        minimal (1-3 rows per table): 45
        moderate (11-30 rows per table): 10
        small (4-10 rows per table): 35
        varied across tables: 10
      conditional_distributions: {}
      property_name: Sample Data Volume
      property_values:
        - minimal (1-3 rows per table)
        - small (4-10 rows per table)
        - moderate (11-30 rows per table)
        - varied across tables
    - base_distributions:
        aggregated summary: 25
        multiple rows and columns: 40
        multiple rows single column: 15
        single row: 12
        single value: 8
      conditional_distributions: {}
      property_name: Query Result Scope
      property_values:
        - single value
        - single row
        - multiple rows single column
        - multiple rows and columns
        - aggregated summary
    - base_distributions:
        brief one-sentence: 20
        detailed step-by-step: 30
        moderate with key operations: 40
        technical with function names: 10
      conditional_distributions: {}
      property_name: Explanation Detail Level
      property_values:
        - brief one-sentence
        - detailed step-by-step
        - moderate with key operations
        - technical with function names
    - base_distributions:
        consistent schema prefix: 2
        no schema prefix: 90
        occasional schema prefix: 8
      conditional_distributions: {}
      property_name: Uses Schema Qualification
      property_values:
        - no schema prefix
        - occasional schema prefix
        - consistent schema prefix
    - base_distributions:
        meaningful aliases: 10
        mixed alias styles: 2
        no aliases: 25
        not applicable: 55
        single-letter aliases: 8
      conditional_distributions:
        Number of Tables:
          '1':
            meaningful aliases: 0
            mixed alias styles: 0
            no aliases: 0
            not applicable: 100
            single-letter aliases: 0
          '2':
            meaningful aliases: 35
            mixed alias styles: 5
            no aliases: 40
            not applicable: 0
            single-letter aliases: 20
          '3':
            meaningful aliases: 45
            mixed alias styles: 10
            no aliases: 20
            not applicable: 0
            single-letter aliases: 25
          '4':
            meaningful aliases: 45
            mixed alias styles: 10
            no aliases: 15
            not applicable: 0
            single-letter aliases: 30
          '5':
            meaningful aliases: 50
            mixed alias styles: 10
            no aliases: 10
            not applicable: 0
            single-letter aliases: 30
      property_name: Uses Table Aliases
      property_values:
        - not applicable
        - no aliases
        - single-letter aliases
        - meaningful aliases
        - mixed alias styles
  selected_sql_schema_column: 7
  selected_sql_query_columns:
    - 8
  createdAt: '2025-11-25'
  updatedAt: '2025-11-25'
