
feat(recorder): Suggest query index

Ankush Menat 2024-05-13 19:39:32 +05:30
parent 49f8f4fe45
commit 4e251e9b0b
8 changed files with 558 additions and 74 deletions

View File: Cypress UI test for Recorder (deleted by this commit)

@@ -1,72 +0,0 @@
context.skip("Recorder", () => {
before(() => {
cy.login();
});
beforeEach(() => {
cy.visit("/app/recorder");
return cy
.window()
.its("frappe")
.then((frappe) => {
// reset recorder
return frappe.xcall("frappe.recorder.stop").then(() => {
return frappe.xcall("frappe.recorder.delete");
});
});
});
it("Recorder Empty State", () => {
cy.get(".page-head").findByTitle("Recorder").should("exist");
cy.get(".indicator-pill").should("contain", "Inactive").should("have.class", "red");
cy.get(".page-actions").findByRole("button", { name: "Start" }).should("exist");
cy.get(".page-actions").findByRole("button", { name: "Clear" }).should("exist");
cy.get(".msg-box").should("contain", "Recorder is Inactive");
cy.get(".msg-box").findByRole("button", { name: "Start Recording" }).should("exist");
});
it("Recorder Start", () => {
cy.get(".page-actions").findByRole("button", { name: "Start" }).click();
cy.get(".indicator-pill").should("contain", "Active").should("have.class", "green");
cy.get(".msg-box").should("contain", "No Requests found");
cy.visit("/app/List/DocType/List");
cy.intercept("POST", "/api/method/frappe.desk.reportview.get").as("list_refresh");
cy.wait("@list_refresh");
cy.get(".page-head").findByTitle("DocType").should("exist");
cy.get(".list-count").should("contain", "20 of ");
cy.visit("/app/recorder");
cy.get(".page-head").findByTitle("Recorder").should("exist");
cy.get(".frappe-list .result-list").should(
"contain",
"/api/method/frappe.desk.reportview.get"
);
});
it("Recorder View Request", () => {
cy.get(".page-actions").findByRole("button", { name: "Start" }).click();
cy.visit("/app/List/DocType/List");
cy.intercept("POST", "/api/method/frappe.desk.reportview.get").as("list_refresh");
cy.wait("@list_refresh");
cy.get(".page-head").findByTitle("DocType").should("exist");
cy.get(".list-count").should("contain", "20 of ");
cy.visit("/app/recorder");
cy.get(".frappe-list .list-row-container span")
.contains("/api/method/frappe")
.should("be.visible")
.click({ force: true });
cy.url().should("include", "/recorder/request");
cy.get("form").should("contain", "/api/method/frappe");
});
});

View File: frappe/core/doctype/recorder/db_optimizer.py (new file)

@@ -0,0 +1,283 @@
"""Basic DB optimizer for Frappe Framework based app.
This is largely based on heuristics and known good practices for indexing.
"""
from collections import defaultdict
from dataclasses import dataclass
from typing import TypeVar
from sql_metadata import Parser
import frappe
from frappe.utils import flt
# Any index that reads more than 30% table on average is not "useful"
INDEX_SCORE_THRESHOLD = 0.3
# Anything reading less than this percent of table is considered optimal
OPTIMIZATION_THRESHOLD = 0.1
T = TypeVar("T")

@dataclass
class DBColumn:
	name: str
	cardinality: int | None
	is_nullable: bool
	default: str
	data_type: str

	@classmethod
	def from_frappe_output(cls, data) -> "DBColumn":
		"""Parse DBColumn from the output of the describe-database-table command in Frappe."""
		return cls(
			name=data["column"],
			cardinality=data.get("cardinality"),
			is_nullable=data["is_nullable"],
			default=data["default"],
			data_type=data["type"],
		)

@dataclass
class DBIndex:
	name: str
	column: str
	table: str
	unique: bool | None = None
	cardinality: int | None = None
	sequence: int = 1
	nullable: bool = True
	_score: float = 0.0

	def __eq__(self, other: "DBIndex") -> bool:
		return (
			self.column == other.column
			and self.sequence == other.sequence
			and self.table == other.table
		)

	def __repr__(self):
		return f"DBIndex(`{self.table}`.`{self.column}`)"

	@classmethod
	def from_frappe_output(cls, data, table) -> "DBIndex":
		"""Parse DBIndex from the output of the describe-database-table command in Frappe."""
		return cls(
			name=data["name"],
			table=table,
			unique=data["unique"],
			cardinality=data["cardinality"],
			sequence=data["sequence"],
			nullable=data["nullable"],
			column=data["column"],
		)

@dataclass
class ColumnStat:
	column_name: str
	avg_frequency: float
	avg_length: float
	nulls_ratio: float | None = None
	histogram: list[float] | None = None

	def __post_init__(self):
		if not self.histogram:
			self.histogram = []

	@classmethod
	def from_frappe_output(cls, data) -> "ColumnStat":
		return cls(
			column_name=data["column_name"],
			avg_frequency=data["avg_frequency"],
			avg_length=data["avg_length"],
			nulls_ratio=data["nulls_ratio"],
			histogram=[flt(bin) for bin in data["histogram"].split(",")] if data["histogram"] else [],
		)

@dataclass
class DBTable:
	name: str
	total_rows: int
	schema: list[DBColumn] | None = None
	indexes: list[DBIndex] | None = None

	def __post_init__(self):
		if not self.schema:
			self.schema = []
		if not self.indexes:
			self.indexes = []

	def update_cardinality(self, column_stats: list[ColumnStat]) -> None:
		"""Estimate cardinality using mysql.column_stats."""
		for column_stat in column_stats:
			for col in self.schema:
				if col.name == column_stat.column_name and not col.cardinality and column_stat.avg_frequency:
					# "hack" or "math" - avg_frequency is how frequently a row value appears on average,
					# i.e. avg_frequency = total_rows / cardinality, so...
					col.cardinality = self.total_rows / column_stat.avg_frequency
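					# Example with assumed numbers: total_rows = 1_000_000 and
					# avg_frequency = 4 gives an estimated cardinality of 250_000.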

	@classmethod
	def from_frappe_output(cls, data) -> "DBTable":
		"""Parse DBTable from the output of the describe-database-table command in Frappe."""
		table_name = data["table_name"]

		return cls(
			name=table_name,
			total_rows=data["total_rows"],
			schema=[DBColumn.from_frappe_output(c) for c in data["schema"]],
			indexes=[DBIndex.from_frappe_output(i, table_name) for i in data["indexes"]],
		)

	def has_column(self, column: str) -> bool:
		for col in self.schema:
			if col.name == column:
				return True
		return False

@dataclass
class DBOptimizer:
	query: str  # raw query in string format
	tables: dict[str, DBTable] | None = None
	parsed_query: Parser | None = None

	def __post_init__(self):
		if not self.tables:
			self.tables = {}
		self.parsed_query = Parser(self.query)

	def tables_examined(self) -> list[str]:
		return self.parsed_query.tables

	def update_table_data(self, table: DBTable):
		self.tables[table.name] = table

	def _convert_to_db_index(self, column: str) -> DBIndex:
		column_name, table = None, None

		if "." in column:
			table, column_name = column.split(".")
		else:
			column_name = column
			for table_name, db_table in self.tables.items():
				if db_table.has_column(column):
					table = table_name
					break
		return DBIndex(column=column_name, name=column_name, table=table)

	def _remove_existing_indexes(self, potential_indexes: list[DBIndex]) -> list[DBIndex]:
		"""Given a list of potential index candidates, remove the ones that already exist.

		This also removes parts of multi-column indexes that are applicable to the query.
		Example: if a multi-column index on A+B+C exists and the query utilizes A+B, then
		A and B are removed from the potential indexes.
		"""

		def remove_maximum_indexes(idx: list[DBIndex]):
			"""Try to remove the entire index from potential indexes; if that's not possible,
			drop one part and try again until no parts are left."""
			if not idx:
				return
			matched_sub_index = []
			for idx_part in list(idx):
				matching_part = [
					i for i in potential_indexes if i.column == idx_part.column and i.table == idx_part.table
				]
				if not matching_part:
					# pop the last part and recurse
					idx.pop()
					return remove_maximum_indexes(idx)
				else:
					matched_sub_index.extend(matching_part)
			# Every part matched, so remove those parts
			for i in matched_sub_index:
				potential_indexes.remove(i)

		# Reconstruct multi-column indexes
		for table in self.tables.values():
			merged_indexes = defaultdict(list)
			for index in table.indexes:
				merged_indexes[index.name].append(index)

			for idx in merged_indexes.values():
				idx.sort(key=lambda x: x.sequence)

			for idx in merged_indexes.values():
				remove_maximum_indexes(idx)

		return potential_indexes

	def potential_indexes(self) -> list[DBIndex]:
		"""Get all columns that can potentially be indexed to speed up this query."""
		possible_indexes = []

		# WHERE clause columns using these operators benefit from an index:
		# 1. = (equality)
		# 2. >, <, >=, <=
		# 3. LIKE 'xyz%' (prefix search)
		# 4. BETWEEN (for date[time] fields)
		# 5. IN (similar to equality)
		if not self.parsed_query.columns_dict:
			return []
		if where_columns := self.parsed_query.columns_dict.get("where"):
			# TODO: Apply some heuristics here; not all columns in the WHERE clause are actually useful
			possible_indexes.extend(where_columns)

		# Join clauses - both sides of a join should ideally be indexed. One will *usually* be the primary key.
		if join_columns := self.parsed_query.columns_dict.get("join"):
			possible_indexes.extend(join_columns)

		# Top-N query variant - an ORDER BY column can possibly speed up the query
		if order_by_columns := self.parsed_query.columns_dict.get("order_by"):
			if self.parsed_query.limit_and_offset:
				possible_indexes.extend(order_by_columns)

		possible_db_indexes = [self._convert_to_db_index(i) for i in possible_indexes]
		possible_db_indexes = [i for i in possible_db_indexes if i.column not in ("*", "name")]
		possible_db_indexes.sort(key=lambda i: (i.table, i.column))

		return self._remove_existing_indexes(possible_db_indexes)
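
	# For reference, the extraction above is driven by sql_metadata.Parser. Expected
	# shapes (values assumed from sql_metadata's documented behavior):
	#   Parser("select `name` from `tabNote` where `title` = 'x' order by `modified` desc limit 10")
	#     .tables           -> ["tabNote"]
	#     .columns_dict     -> {"select": ["name"], "where": ["title"], "order_by": ["modified"]}
	#     .limit_and_offset -> (10, 0)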

	def suggest_index(self) -> DBIndex | None:
		"""Suggest the best possible column to index, given the query and table stats."""
		if missing_tables := set(self.tables_examined()) - set(self.tables.keys()):
			frappe.throw("DBTable information missing for: " + ", ".join(missing_tables))

		potential_indexes = self.potential_indexes()

		for index in list(potential_indexes):
			table = self.tables[index.table]

			# Data type is not easily indexable - skip
			column = next(c for c in table.schema if c.name == index.column)
			if "text" in column.data_type.lower() or "json" in column.data_type.lower():
				potential_indexes.remove(index)
			# Update cardinality from column so scoring can be done
			index.cardinality = column.cardinality

		for index in potential_indexes:
			index._score = self.index_score(index)

		potential_indexes.sort(key=lambda i: i._score)
		if (
			potential_indexes
			and (best_index := potential_indexes[0])
			and best_index._score < INDEX_SCORE_THRESHOLD
		):
			return best_index

	def index_score(self, index: DBIndex) -> float:
		"""Score an index from 0 to 1 based on usefulness.

		A score of 0.5 indicates that on average this index will read 50% of the table (e.g. checkboxes)."""
		table = self.tables[index.table]

		cardinality = index.cardinality or 2
		total_rows = table.total_rows or cardinality or 1

		# We assume most unique values are evenly distributed; this is
		# definitely not the case IRL, but it should be a good enough assumption.
		# The score is roughly what fraction of the table we will end up reading on a typical query.
		rows_fetched_on_average = (table.total_rows or cardinality) / cardinality
		return rows_fetched_on_average / total_rows
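
To make the scoring concrete: the score reduces to roughly 1 / cardinality. In a 100,000-row table, a boolean column (cardinality 2) scores (100,000 / 2) / 100,000 = 0.5 and is rejected by the 0.3 threshold, while a mostly-unique column scores near zero and is a strong candidate. Below is a minimal usage sketch of the class above; the query, table, and statistics are invented stand-ins for real describe-database-table output:

from frappe.core.doctype.recorder.db_optimizer import DBColumn, DBOptimizer, DBTable

optimizer = DBOptimizer(query="select `name` from `tabNote` where `email` = 'a@example.com'")

# Hypothetical stats; in production these come from DBTable.from_frappe_output()
optimizer.update_table_data(
	DBTable(
		name="tabNote",
		total_rows=100_000,
		schema=[DBColumn(name="email", cardinality=50_000, is_nullable=True, default=None, data_type="varchar")],
	)
)

index = optimizer.suggest_index()
print(index)  # DBIndex(`tabNote`.`email`) - score 2e-05, well under INDEX_SCORE_THRESHOLD

Note that suggest_index returns None when every candidate scores above INDEX_SCORE_THRESHOLD, so callers must handle the no-suggestion case.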

View File: frappe/core/doctype/recorder/recorder.js

@@ -12,6 +12,16 @@ frappe.ui.form.on("Recorder", {
		frm.fields_dict.sql_queries.grid.grid_pagination.page_length = 500;
		refresh_field("sql_queries");
		frm.trigger("format_grid");

		frm.add_custom_button(__("Suggest Optimizations"), () => {
			frappe.xcall("frappe.core.doctype.recorder.recorder.optimize", {
				recorder_id: frm.doc.name,
			});
		});

		frappe.realtime.on("recorder-analysis-complete", () => {
			frm.reload_doc();
			setTimeout(() => frm.scroll_to_field("suggested_indexes"), 1500);
		});
	},

	setup_sort: function (frm) {
@@ -25,6 +35,7 @@ frappe.ui.form.on("Recorder", {
				frm._sort_order[field] = -1 * sort_order; // reverse for next click
				grid.refresh();
				frm.trigger("setup_sort"); // grid creates new elements again, so listeners must be set up again.
				frm.trigger("format_grid");
			});
		});
	},

View File: frappe/core/doctype/recorder/recorder.json

@@ -20,6 +20,7 @@
"section_break_sgro",
"form_dict",
"section_break_9jhm",
"suggested_indexes",
"sql_queries",
"section_break_optn",
"profile"
@@ -119,6 +120,13 @@
"fieldtype": "Code",
"label": "cProfile Output",
"read_only": 1
},
{
"description": "Disclaimer: These indexes are suggested based on data and queries performed during this recording. These suggestions may or may not help.",
"fieldname": "suggested_indexes",
"fieldtype": "Table",
"label": "Suggested Indexes",
"options": "Recorder Suggested Index"
}
],
"hide_toolbar": 1,
@@ -126,7 +134,7 @@
"index_web_pages_for_search": 1,
"is_virtual": 1,
"links": [],
"modified": "2024-02-01 22:13:26.505174",
"modified": "2024-05-14 15:16:55.626656",
"modified_by": "Administrator",
"module": "Core",
"name": "Recorder",

View File: frappe/core/doctype/recorder/recorder.py

@@ -1,10 +1,16 @@
# Copyright (c) 2023, Frappe Technologies and contributors
# For license information, please see license.txt

from collections import Counter, defaultdict

import frappe
from frappe import _
from frappe.core.doctype.recorder.db_optimizer import DBOptimizer, DBTable
from frappe.model.document import Document
from frappe.recorder import RECORDER_REQUEST_HASH
from frappe.recorder import get as get_recorder_data
from frappe.utils import cint, evaluate_filters
from frappe.utils import cint, cstr, evaluate_filters, get_table_name
from frappe.utils.caching import redis_cache
class Recorder(Document):
@@ -15,6 +21,9 @@ class Recorder(Document):
	if TYPE_CHECKING:
		from frappe.core.doctype.recorder_query.recorder_query import RecorderQuery
		from frappe.core.doctype.recorder_suggested_index.recorder_suggested_index import (
			RecorderSuggestedIndex,
		)
		from frappe.types import DF

		cmd: DF.Data | None
@@ -27,6 +36,7 @@ class Recorder(Document):
		profile: DF.Code | None
		request_headers: DF.Code | None
		sql_queries: DF.Table[RecorderQuery]
		suggested_indexes: DF.Table[RecorderSuggestedIndex]
		time: DF.Datetime | None
		time_in_queries: DF.Float
	# end: auto-generated types
@@ -95,8 +105,160 @@ def serialize_request(request):
		request_headers=frappe.as_json(request.get("headers"), indent=4),
		form_dict=frappe.as_json(request.get("form_dict"), indent=4),
		sql_queries=request.get("calls"),
		suggested_indexes=request.get("suggested_indexes"),
		modified=request.get("time"),
		creation=request.get("time"),
	)
	return request

@frappe.whitelist()
def optimize(recorder_id: str):
	frappe.only_for("Administrator")
	frappe.enqueue(_optimize, recorder_id=recorder_id, queue="long")

def _optimize(recorder_id):
	record: Recorder = frappe.get_doc("Recorder", recorder_id)
	total_duration = record.time_in_queries

	# Any index with query time less than 5% of total time is not suggested
	PERCENT_DURATION_THRESHOLD_OVERALL = 0.05
	# Any query with duration less than 0.5% of total duration is not analyzed
	PERCENT_DURATION_THRESHOLD_QUERY = 0.005
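	# Example with assumed numbers: if the recording spent a total of 2.0s in queries,
	# queries shorter than 10ms (0.5%) are skipped entirely, and a suggested index
	# must account for more than 100ms (5%) of recorded query time to be surfaced.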

	# Index suggestion -> Query duration
	index_suggestions = Counter()

	for idx, captured_query in enumerate(record.sql_queries, start=1):
		query = cstr(captured_query.query)
		frappe.publish_progress(
			idx / len(record.sql_queries) * 100,
			title="Analyzing Queries",
			doctype=record.doctype,
			docname=record.name,
			description=f"Analyzing query: {query[:140]}",
		)
		if captured_query.duration < total_duration * PERCENT_DURATION_THRESHOLD_QUERY:
			continue
		if not query.lower().strip().startswith(("select", "update", "delete")):
			continue
		if index := _optimize_query(query):
			index_suggestions[(index.table, index.column)] += captured_query.duration

	suggested_indexes = index_suggestions.most_common(3)
	suggested_indexes = [
		idx for idx in suggested_indexes if idx[1] > total_duration * PERCENT_DURATION_THRESHOLD_OVERALL
	]

	if not suggested_indexes:
		frappe.msgprint(_("No optimization suggestions."), realtime=True)
		return

	frappe.msgprint(_("Query analysis complete. Check suggested indexes."), realtime=True, alert=True)
	data = frappe.cache.hget(RECORDER_REQUEST_HASH, record.name)
	data["suggested_indexes"] = [{"table": idx[0][0], "column": idx[0][1]} for idx in suggested_indexes]
	frappe.cache.hset(RECORDER_REQUEST_HASH, record.name, data)
	frappe.publish_realtime("recorder-analysis-complete")

def _optimize_query(query):
	optimizer = DBOptimizer(query=query)
	tables = optimizer.tables_examined()

	# Note: Two passes are required here because we first need basic data to understand which
	# columns need to be analyzed to get accurate cardinality.
	for table in tables:
		doctype = get_doctype_name(table)
		stats = _fetch_table_stats(doctype, columns=[])
		if not stats:
			return
		db_table = DBTable.from_frappe_output(stats)
		optimizer.update_table_data(db_table)

	potential_indexes = optimizer.potential_indexes()

	tablewise_columns = defaultdict(list)
	for idx in potential_indexes:
		tablewise_columns[idx.table].append(idx.column)

	for table in tables:
		doctype = get_doctype_name(table)
		stats = _fetch_table_stats(doctype, columns=tablewise_columns[table])
		if not stats:
			return
		db_table = DBTable.from_frappe_output(stats)
		optimizer.update_table_data(db_table)

	return optimizer.suggest_index()

def _fetch_table_stats(doctype: str, columns: list[str]) -> dict | None:
	def sql_bool(val):
		return cstr(val).lower() in ("yes", "1", "true")

	if not frappe.db.table_exists(doctype):
		return

	table = get_table_name(doctype, wrap_in_backticks=True)
	schema = []
	for field in frappe.db.sql(f"describe {table}", as_dict=True):
		schema.append(
			{
				"column": field["Field"],
				"type": field["Type"],
				"is_nullable": sql_bool(field["Null"]),
				"default": field["Default"],
			}
		)

	def update_cardinality(column, value):
		for col in schema:
			if col["column"] == column:
				col["cardinality"] = value
				break

	indexes = []
	for idx in frappe.db.sql(f"show index from {table}", as_dict=True):
		indexes.append(
			{
				"unique": not sql_bool(idx["Non_unique"]),
				"cardinality": idx["Cardinality"],
				"name": idx["Key_name"],
				"sequence": idx["Seq_in_index"],
				"nullable": sql_bool(idx["Null"]),
				"column": idx["Column_name"],
				"type": idx["Index_type"],
			}
		)
		if idx["Seq_in_index"] == 1:
			update_cardinality(idx["Column_name"], idx["Cardinality"])

	total_rows = cint(
		frappe.db.sql(
			f"""select table_rows
			from information_schema.tables
			where table_name = 'tab{doctype}'"""
		)[0][0]
	)

	# Fetch accurate cardinality for the requested columns with a real query.
	# WARNING: This can take A LOT of time on large tables.
	for column in columns:
		cardinality = _get_column_cardinality(table, column)
		update_cardinality(column, cardinality)

	return {
		"table_name": table.strip("`"),
		"total_rows": total_rows,
		"schema": schema,
		"indexes": indexes,
	}

@redis_cache
def _get_column_cardinality(table, column):
	return frappe.db.sql(f"select count(distinct {column}) from {table}")[0][0]


def get_doctype_name(table_name: str) -> str:
	return table_name.removeprefix("tab")
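
For reference, the same analysis can be kicked off without the form button from a bench console. A minimal sketch (the recorder ID is hypothetical; use the name of an actual Recorder document):

from frappe.core.doctype.recorder.recorder import _optimize, optimize

# Whitelisted path: Administrator check, then the analysis runs on the "long" queue
optimize(recorder_id="06501e2f1d")  # hypothetical recorder ID

# Or run it synchronously, which is handier when debugging
_optimize(recorder_id="06501e2f1d")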

View File: frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.json (new file)

@@ -0,0 +1,46 @@
{
 "actions": [],
 "allow_rename": 1,
 "creation": "2024-05-14 15:03:46.138438",
 "doctype": "DocType",
 "editable_grid": 1,
 "engine": "InnoDB",
 "field_order": [
  "table",
  "column",
  "add_index"
 ],
 "fields": [
  {
   "fieldname": "table",
   "fieldtype": "Data",
   "in_list_view": 1,
   "label": "Table"
  },
  {
   "fieldname": "column",
   "fieldtype": "Data",
   "in_list_view": 1,
   "label": "Column"
  },
  {
   "columns": 2,
   "fieldname": "add_index",
   "fieldtype": "Button",
   "label": "Add Index"
  }
 ],
 "index_web_pages_for_search": 1,
 "is_virtual": 1,
 "istable": 1,
 "links": [],
 "modified": "2024-05-14 15:18:51.371808",
 "modified_by": "Administrator",
 "module": "Core",
 "name": "Recorder Suggested Index",
 "owner": "Administrator",
 "permissions": [],
 "sort_field": "creation",
 "sort_order": "DESC",
 "states": []
}

View File: frappe/core/doctype/recorder_suggested_index/recorder_suggested_index.py (new file)

@@ -0,0 +1,46 @@
# Copyright (c) 2024, Frappe Technologies and contributors
# For license information, please see license.txt

# import frappe
from frappe.model.document import Document


class RecorderSuggestedIndex(Document):
	# begin: auto-generated types
	# This code is auto-generated. Do not modify anything in this block.

	from typing import TYPE_CHECKING

	if TYPE_CHECKING:
		from frappe.types import DF

		column: DF.Data | None
		parent: DF.Data
		parentfield: DF.Data
		parenttype: DF.Data
		table: DF.Data | None
	# end: auto-generated types

	def db_insert(self, *args, **kwargs):
		raise NotImplementedError

	def load_from_db(self):
		raise NotImplementedError

	def db_update(self):
		raise NotImplementedError

	def delete(self):
		raise NotImplementedError

	@staticmethod
	def get_list(filters=None, page_length=20, **kwargs):
		pass

	@staticmethod
	def get_count(filters=None, **kwargs):
		pass

	@staticmethod
	def get_stats(**kwargs):
		pass
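
Since Recorder data lives entirely in the Redis cache (RECORDER_REQUEST_HASH) rather than in database tables, both Recorder and this child DocType are virtual ("is_virtual": 1): the db_insert/load_from_db/db_update/delete stubs raise NotImplementedError so that any accidental attempt to persist a suggested index to the database fails loudly.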