Improve handling of unrecoverable storage corruption (#96712)

* Improve handling of unrecoverable storage corruption

fixes #96574

If something in storage gets corrupted core can boot loop
or if its integration specific, the integration will fail to
start.

We now complainly loudly in the log, move away the corrupt data
and start fresh to allow startup to proceed so the user can
get to the UI and restore from backup without having to attach
a console (or otherwise login to the OS and manually modify files).

* test for corruption

* ensure OSError is still fatal

* one more case

* create an issue for corrupt storage

* fix key

* persist

* feedback

* feedback

* better to give the full path

* tweaks

* grammar

* add time

* feedback

* adjust

* try to get issue_domain from storage key

* coverage

* tweak wording some more
This commit is contained in:
J. Nick Koston
2023-07-19 02:23:12 -05:00
committed by GitHub
parent 3e58e1987c
commit 01e66d6fb2
3 changed files with 236 additions and 7 deletions

View File

@@ -6,15 +6,24 @@ from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from copy import deepcopy
import inspect
from json import JSONEncoder
from json import JSONDecodeError, JSONEncoder
import logging
import os
from typing import Any, Generic, TypeVar
from homeassistant.const import EVENT_HOMEASSISTANT_FINAL_WRITE
from homeassistant.core import CALLBACK_TYPE, CoreState, Event, HomeAssistant, callback
from homeassistant.core import (
CALLBACK_TYPE,
DOMAIN as HOMEASSISTANT_DOMAIN,
CoreState,
Event,
HomeAssistant,
callback,
)
from homeassistant.exceptions import HomeAssistantError
from homeassistant.loader import MAX_LOAD_CONCURRENTLY, bind_hass
from homeassistant.util import json as json_util
import homeassistant.util.dt as dt_util
from homeassistant.util.file import WriteError
from . import json as json_helper
@@ -146,9 +155,63 @@ class Store(Generic[_T]):
# and we don't want that to mess with what we're trying to store.
data = deepcopy(data)
else:
data = await self.hass.async_add_executor_job(
json_util.load_json, self.path
)
try:
data = await self.hass.async_add_executor_job(
json_util.load_json, self.path
)
except HomeAssistantError as err:
if isinstance(err.__cause__, JSONDecodeError):
# If we have a JSONDecodeError, it means the file is corrupt.
# We can't recover from this, so we'll log an error, rename the file and
# return None so that we can start with a clean slate which will
# allow startup to continue so they can restore from a backup.
isotime = dt_util.utcnow().isoformat()
corrupt_postfix = f".corrupt.{isotime}"
corrupt_path = f"{self.path}{corrupt_postfix}"
await self.hass.async_add_executor_job(
os.rename, self.path, corrupt_path
)
storage_key = self.key
_LOGGER.error(
"Unrecoverable error decoding storage %s at %s; "
"This may indicate an unclean shutdown, invalid syntax "
"from manual edits, or disk corruption; "
"The corrupt file has been saved as %s; "
"It is recommended to restore from backup: %s",
storage_key,
self.path,
corrupt_path,
err,
)
from .issue_registry import ( # pylint: disable=import-outside-toplevel
IssueSeverity,
async_create_issue,
)
issue_domain = HOMEASSISTANT_DOMAIN
if (
domain := (storage_key.partition(".")[0])
) and domain in self.hass.config.components:
issue_domain = domain
async_create_issue(
self.hass,
HOMEASSISTANT_DOMAIN,
f"storage_corruption_{storage_key}_{isotime}",
is_fixable=True,
issue_domain=issue_domain,
translation_key="storage_corruption",
is_persistent=True,
severity=IssueSeverity.CRITICAL,
translation_placeholders={
"storage_key": storage_key,
"original_path": self.path,
"corrupt_path": corrupt_path,
"error": str(err),
},
)
return None
raise
if data == {}:
return None