Skip to content

WPB-17408 inconsistent database: unparseable emails #4578

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add tool to find users with unparseable emails
6 changes: 3 additions & 3 deletions tools/db/inconsistencies/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ spec:
restartPolicy: Never
containers:
- name: inconsistencies
image: <image-in-your-personal-docker-repo>
image: <image-with-tag>
imagePullPolicy: Always
args:
- handle-less-users # adjust to the command you need, see Options.hs
Expand All @@ -49,15 +49,15 @@ spec:
- --cassandra-keyspace-brig
- brig
- --inconsistencies-file
- /inconsistencies.log
- /tmp/inconsistencies.log
```

4. Wait for the process to finish. Watch the logs: it will say something like "sleeping for 4 hours" and then close all connections to cassandra.

5. Copy the logs using `kubectl cp`

```
kubectl cp inconsistencies:/inconsistencies.log inconsistencies.log
kubectl cp inconsistencies:/tmp/inconsistencies.log inconsistencies.log
```

6. **IMPORTANT:** Delete the pod. The easiest way to do this is with `kubectl delete -f <filename>` (which also deletes any configmap)
Expand Down
2 changes: 2 additions & 0 deletions tools/db/inconsistencies/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
, bytestring
, cassandra-util
, conduit
, email-validate
, extended
, extra
, gitignoreSource
Expand All @@ -33,6 +34,7 @@ mkDerivation {
bytestring
cassandra-util
conduit
email-validate
extended
extra
imports
Expand Down
2 changes: 2 additions & 0 deletions tools/db/inconsistencies/inconsistencies.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ executable inconsistencies
DanglingHandles
DanglingUserKeys
EmailLessUsers
EmailUnparseableUsers
HandleLessUsers
Options
Paths_inconsistencies
Expand Down Expand Up @@ -74,6 +75,7 @@ executable inconsistencies
, bytestring
, cassandra-util
, conduit
, email-validate
, extended
, extra
, imports
Expand Down
113 changes: 113 additions & 0 deletions tools/db/inconsistencies/src/EmailUnparseableUsers.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
module EmailUnparseableUsers where

import Cassandra
import Cassandra.Util
import Conduit
import Data.Aeson (ToJSON, object, (.=))
import Data.Aeson qualified as Aeson
import Data.ByteString qualified as BS
import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as C
import Data.Id (UserId)
import Data.Text.Encoding qualified as TE
import Imports
import System.Logger (Logger)
import System.Logger qualified as Log
import Text.Email.Validate qualified as Email
import UnliftIO (pooledMapConcurrentlyN)
import Wire.API.User (AccountStatus)

-- import Wire.API.User.EmailAddress (EmailAddress)

-- Problem statement:
-- Upon brig re-index, we look up the user table and create an
-- IndexUser, which contains a field of type EmailAddress
-- (see libs/wire-subsystems/src/Wire/UserStore/IndexUser.hs).
--
-- Parsing this fails in some cases, most likely during the conversion of the cassandra type to EmailAddress.
--
-- In August 2024 emails were refactored in https://github.com/wireapp/wire-server/pull/4206. Possibly, emails that already existed in the database were never checked for conformance with the new library.
--
-- Email parsing from cql/bytestring is done in
-- libs/wire-api/src/Wire/API/User/EmailAddress.hs
--
-- | Scan the brig @user@ table page by page and write one JSON object per
-- line to @inconsistenciesFile@ for every user whose stored email fails to
-- validate (see 'toOffender').
--
-- Arguments: the logger used for progress output, the Cassandra client state
-- used to run 'getUsers', and the path of the output file.
--
-- Pages that contain no offender contribute nothing to the output, so the
-- resulting file holds only JSON records and no blank lines.
runCommand :: Logger -> ClientState -> FilePath -> IO ()
runCommand l brig inconsistenciesFile = do
  runResourceT $
    runConduit $
      zipSources
        (C.sourceList [(1 :: Int32) ..])
        (transPipe (runClient brig) getUsers)
        .| C.mapM
          ( \(i, userDetails) -> do
              -- Progress counter: rows seen so far, computed as if every
              -- earlier page had been a full page of 'pageSize' rows.
              Log.info l (Log.field "userIds" (show ((i - 1) * pageSize + fromIntegral (length userDetails))))
              pure $ mapMaybe toOffender userDetails
          )
        .| C.mapM (liftIO . pooledMapConcurrentlyN 48 checkOffender)
        -- Terminate each record with its own newline. Unlike
        -- @(<> "\n") . intercalate "\n"@, this yields an empty chunk (not a
        -- lone "\n") for a page without offenders.
        .| C.map (BS.concat . map ((<> "\n") . BS.toStrict . Aeson.encode) . catMaybes)
        .| sinkFile inconsistenciesFile

-- | Select rows whose stored email is rejected by 'Email.validate'.
--
-- Returns the row together with the validator's error message when an email
-- is present and does not validate. Returns 'Nothing' when the email
-- validates, or when the row carries no email at all.
toOffender :: UserDetailsRow -> Maybe Offender
toOffender row@(_, _, _, _, storedEmail) =
  storedEmail >>= \rawEmail ->
    either
      (\parseErr -> Just (row, parseErr)) -- keep the validator's message
      (const Nothing) -- valid email: nothing to report
      (Email.validate (TE.encodeUtf8 rawEmail))

-- | Render an offender as a JSON object with the keys @userId@, @email@,
-- @activated@, @accountStatus@, @statusWritetime@ and @parseError@.
--
-- Yields 'Nothing' only for a row without an email; 'toOffender' never
-- produces such an offender, so that branch is not expected to be taken.
checkOffender :: Offender -> IO (Maybe Aeson.Value)
checkOffender (row, err) =
  case row of
    (uid, activated, accStat, wt, Just rawEmail) ->
      pure . Just $
        object
          [ "userId" .= uid,
            "email" .= rawEmail,
            "activated" .= activated,
            "accountStatus" .= accStat,
            "statusWritetime" .= wt,
            "parseError" .= err
          ]
    _ -> pure Nothing

-- | Page size passed to the paginated query in 'getUsers'; 'runCommand' also
-- uses it to compute its progress counter.
pageSize :: Int32
pageSize = 10000

-- | One result row of the query in 'getUsers', in column order:
-- @id@, @activated@, @status@, @writetime(status)@, @email@.
type UserDetailsRow = (UserId, Maybe Bool, Maybe AccountStatus, Maybe (Writetime AccountStatus), Maybe Text)

-- | A user row whose email failed validation, paired with the parse-error
-- message reported by the validator.
type Offender = (UserDetailsRow, String)

-- | Stream the @user@ table as pages of at most 'pageSize' rows, read at
-- 'LocalQuorum' consistency. 'x5' is passed through to 'paginateC'
-- (presumably a retry setting from the Cassandra utilities; confirm).
getUsers :: ConduitM () [UserDetailsRow] Client ()
getUsers =
  let selectUsers :: PrepQuery R () UserDetailsRow
      selectUsers = "SELECT id, activated, status, writetime(status), email from user"
   in paginateC selectUsers (paramsP LocalQuorum () pageSize) x5

-- | A value bundled with a 'Writetime' for it. Used by 'UserDetails' to keep
-- the account status next to the write time of the status column.
data WithWritetime a = WithWritetime
  { value :: a,
    writetime :: Writetime a
  }
  deriving (Generic)

-- The instance body is empty, so the class's default methods are used
-- (presumably the 'Generic'-based encoding; confirm against aeson).
instance (ToJSON a) => ToJSON (WithWritetime a)

-- | Record form of a 'UserDetailsRow', built by 'mkUserDetails'.
-- NOTE(review): nothing in 'runCommand' refers to this type; it may be
-- left over from a sibling tool. Confirm before relying on it.
data UserDetails = UserDetails
  { id_ :: UserId,
    activated :: Maybe Bool,
    -- | Account status together with the write time of the status column.
    accountStatus :: Maybe (WithWritetime AccountStatus),
    email :: Maybe Text
  }
  deriving (Generic)

-- Empty instance body: the class's default methods are used.
instance ToJSON UserDetails

-- | Convert a raw 'UserDetailsRow' into a 'UserDetails' record.
--
-- The account status and its write time are combined into a single
-- 'WithWritetime'; that field is 'Nothing' unless both are present.
mkUserDetails :: UserDetailsRow -> UserDetails
mkUserDetails (userId, isActivated, mStatus, mStatusWritetime, mEmail) =
  UserDetails
    { id_ = userId,
      activated = isActivated,
      accountStatus = statusWithWritetime,
      email = mEmail
    }
  where
    statusWithWritetime = WithWritetime <$> mStatus <*> mStatusWritetime
3 changes: 3 additions & 0 deletions tools/db/inconsistencies/src/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import Cassandra.Settings as C
import DanglingHandles qualified
import DanglingUserKeys qualified
import EmailLessUsers qualified
import EmailUnparseableUsers qualified
import HandleLessUsers qualified
import Imports
import Options as O
Expand All @@ -53,6 +54,8 @@ main = do
DanglingUserKeys.runCommand workLogger brig outputFile
DanglingUserKeys (Just (inputFile, repairData)) ->
DanglingUserKeys.runRepair workLogger brig inputFile outputFile repairData
EmailUnparseableUsers ->
EmailUnparseableUsers.runCommand workLogger brig outputFile
MissingEmailUserKeys (Just (inputFile, repairData)) ->
EmailLessUsers.runRepair workLogger brig inputFile outputFile repairData
MissingEmailUserKeys Nothing ->
Expand Down
6 changes: 5 additions & 1 deletion tools/db/inconsistencies/src/Options.hs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ data Command
= DanglingHandles (Maybe (FilePath, Bool))
| HandleLessUsers
| DanglingUserKeys (Maybe (FilePath, Bool))
| EmailUnparseableUsers
| MissingEmailUserKeys (Maybe (FilePath, Bool))

optionsParser :: Parser (Command, Settings)
Expand All @@ -49,7 +50,7 @@ optionsParser = (,) <$> commandParser <*> settingsParser
commandParser :: Parser Command
commandParser =
subparser $
danglingHandlesCommand <> handleLessUsersCommand <> danglingKeysCommand <> missingEmailsCommand
danglingHandlesCommand <> handleLessUsersCommand <> danglingKeysCommand <> unparseableEmailsCommand <> missingEmailsCommand

-- | The @dangling-handles@ subcommand. It optionally takes the arguments
-- parsed by @inputFileRepairParser "handles"@ and yields 'DanglingHandles'.
danglingHandlesCommand :: Mod CommandFields Command
danglingHandlesCommand =
  command "dangling-handles" $
    info
      (DanglingHandles <$> optional (inputFileRepairParser "handles"))
      (progDesc "find handle which shouldn't be claimed")
Expand All @@ -60,6 +61,9 @@ danglingKeysCommand = command "dangling-keys" (info (DanglingUserKeys <$> option
-- | The @missing-email-keys@ subcommand. It optionally takes the arguments
-- parsed by @inputFileRepairParser "emails"@ and yields 'MissingEmailUserKeys'.
missingEmailsCommand :: Mod CommandFields Command
missingEmailsCommand =
  command "missing-email-keys" $
    info
      (MissingEmailUserKeys <$> optional (inputFileRepairParser "emails"))
      (progDesc "find missing email keys (users with emails inside user table but not inside user_keys table)")

-- | The @unparseable-emails@ subcommand. It takes no arguments of its own
-- and always yields 'EmailUnparseableUsers'.
unparseableEmailsCommand :: Mod CommandFields Command
unparseableEmailsCommand =
  command
    "unparseable-emails"
    -- The help text previously ended with an unbalanced ")".
    (info (pure EmailUnparseableUsers) (progDesc "find users with an email stored that cannot be parsed"))

-- | The @handle-less-users@ subcommand. It takes no arguments of its own and
-- always yields 'HandleLessUsers'.
handleLessUsersCommand :: Mod CommandFields Command
handleLessUsersCommand =
  command "handle-less-users" $
    info
      (pure HandleLessUsers)
      (progDesc "find users which have a handle in the user table but not in the user_handle table")

Expand Down