Purpose
The purpose of this tutorial is to show an example of filtering selective HTML code from string content without resorting
to C#, blind encoding (and then later decoding) or a prefilter process before the content gets to the database. This is
particularly useful when you are using automation that generates HTML which needs to be preserved when going to a webpage
or another target where you do not want to encumber the server with the need to parse or unencode that content prior to
HTML rendering (saving server resources)...but, at the same time, you want to catch potentially unsafe html code before it
makes its way into the database.
This set of TSQL demonstrates how something like that may be accomplished. However, as a tutorial, some extra material has
been introduced to better illustrate how it all works which is best experienced by copying and pasting it into SSMS and
revising it as needed.
To summarize, the filter looks for potentially unsafe "script", "iframe", "a href" tags and removes them while preserving
other html code as-is. Ideally the filter is meant to be used for data just before it goes into a database, alleviating
the need to perform operations on that data when it leaves the database in those cases where content was substituted,
selectively encoded (which would require a decode on the way out), blanket encoded and so forth.
-- The easiest way to apply a filter is to use the SQL XML function like "SELECT '< then >' FOR XML PATH('');" which will encode to "< then >" but the entire string would be blindly encoded all at once.
-- This shows an example of selectively filtering HTML.
CREATE TABLE #filter (
Id INT PRIMARY KEY IDENTITY(1,1)
,TagSyntax VARCHAR(50) NOT NULL
,InsertedDate DATETIME NOT NULL DEFAULT CAST(GETDATE() AS DATETIME)
);
-- If you do not store the tags to filter in a regular table, you can insert them explicitly
INSERT INTO #filter (TagSyntax) VALUES
('script')
,('iframe')
,('a href')
,('/a');
-- Otherwise you can insert the tages from a regular table wit something similar to the following. NOTE: If you use this method then you do not need the CREATE or INSERT code above.
-- SELECT Id, TagSyntax
-- INTO #filter
-- FROM TableHoldingFilters;
-- Extraneous table present to help visualize how this all works
CREATE TABLE #letters (
Id INT IDENTITY(1,1)
,Letter VARCHAR(1) DEFAULT NULL
);
CREATE TABLE #tagPositions (
Id INT IDENTITY(1,1)
,StartPosition INT DEFAULT NULL
,EndPosition INT DEFAULT NULL
,TagSyntax VARCHAR(500) DEFAULT NULL
,TagModSyntax VARCHAR(500) DEFAULT NULL
);
-- An example of the string we want to selectively filter some HTML out of, while preserving other HTML
DECLARE @inputData VARCHAR(MAX) = 'The < scriP t src="blabla">something big with tasty noodles and <iframe src="url..." width="10px" height="10px" /><a href="flame-off">biscuits</a>.';
-- Deconstruct string and locate tag positions
DECLARE @tStep INT = 1;
DECLARE @tStepMax INT = LEN(@inputData);
DECLARE @startPosition INT = NULL;
DECLARE @endPosition INT = NULL;
DECLARE @tagSyntax VARCHAR(500) = NULL;
WHILE (@tStep <= @tStepMax)
BEGIN
DECLARE @currentChar VARCHAR(1) = SUBSTRING(@inputData, @tStep, 1);
INSERT INTO #letters (Letter) VALUES (@currentChar);
IF NOT @tagSyntax IS NULL
BEGIN
SET @tagSyntax = CONCAT(@tagSyntax, @currentChar);
END
IF @currentChar = '<' AND @startPosition IS NULL
BEGIN
SET @startPosition = @tStep;
SET @tagSyntax = @currentChar;
END
IF @currentChar = '>' AND NOT @startPosition IS NULL
BEGIN
DECLARE @tagModSyntax VARCHAR(500) = LOWER(REPLACE(@tagSyntax, ' ', ''));
SET @endPosition = @tStep;
INSERT INTO #tagPositions (StartPosition, EndPosition, TagSyntax, TagModSyntax) VALUES (@startPosition, @endPosition, @tagSyntax, @tagModSyntax);
SET @startPosition = NULL;
SET @endPosition = NULL;
SET @tagSyntax = NULL;
END
SET @tStep = @tStep + 1;
END
-- Perform Selective Filtering
DECLARE @filteredData VARCHAR(MAX) = @inputData;
SET @tStep = 1;
SET @tStepMax = (SELECT MAX(Id) FROM #tagPositions);
WHILE (@tStep <= @tStepMax)
BEGIN
-- Get tag present in data
DECLARE @tOriginal VARCHAR(500) = NULL;
DECLARE @tModified VARCHAR(500) = NULL;
SELECT
@tOriginal = TagSyntax
,@tModified = TagModSyntax
FROM #tagPositions WHERE Id = @tStep;
-- Evaluate against filter and remove
DECLARE @fStep INT = 1;
DECLARE @fStepMax INT = (SELECT MAX(Id) FROM #filter);
WHILE (@fStep <= @fStepMax)
BEGIN
DECLARE @filterItem VARCHAR(50) = (SELECT LOWER(REPLACE(TagSyntax, ' ', '')) FROM #filter WHERE Id = @fStep);
IF (@tModified LIKE CONCAT('%', @filterItem, '%'))
BEGIN
SET @filteredData = REPLACE(@filteredData, @tOriginal, '');
END
SET @fStep = @fStep + 1;
END
SET @tStep = @tStep + 1;
END
-- Return temp tables to review
SELECT * FROM #letters;
SELECT * FROM #tagPositions;
-- Clean up
IF (OBJECT_ID('tempdb..#letters') IS NOT NULL)
BEGIN
DROP TABLE #letters;
END
IF (OBJECT_ID('tempdb..#tagPositions') IS NOT NULL)
BEGIN
DROP TABLE #tagPositions;
END
IF (OBJECT_ID('tempdb..#filter') IS NOT NULL)
BEGIN
DROP TABLE #filter;
END
-- Return original data to review
SELECT @inputData;
-- Return selectively filtered data to review
SELECT @filteredData;