Вообщем - если данных много, и текста большие, то такое сравнение выполняется очень долго.
Есть ли возможность разрулить нагрузку на уровне php + sql? Или придется педалить на другой технологии?
CREATE TABLE public.pending_news (
id integer NOT NULL,
source_id integer NOT NULL,
pq_id bigint,
title text NOT NULL,
content text NOT NULL,
search_content text NOT NULL,
thumb_src text,
status public.pending_news_enum_status NOT NULL,
group_hash character varying(45) NOT NULL,
processed smallint,
created_at timestamp with time zone NOT NULL,
update_at timestamp with time zone NOT NULL,
additonal_data json,
search_vector tsvector
);
ALTER TABLE public.pending_news OWNER TO postgres;
CREATE SEQUENCE public.pending_news_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER TABLE public.pending_news_id_seq OWNER TO postgres;
ALTER SEQUENCE public.pending_news_id_seq OWNED BY public.pending_news.id;
ALTER TABLE ONLY public.pending_news ALTER COLUMN id SET DEFAULT nextval('public.pending_news_id_seq'::regclass);
ALTER TABLE ONLY public.pending_news
ADD CONSTRAINT pending_news_pkey PRIMARY KEY (id);
CREATE INDEX content_search_idx ON public.pending_news USING gin (to_tsvector('russian'::regconfig, search_content));
CREATE INDEX gin_search_vector ON public.pending_news USING gin (search_vector);
CREATE INDEX pending_news_fk_pending_news_1_idx ON public.pending_news USING btree (source_id);
CREATE UNIQUE INDEX pending_news_fk_pending_news_2_idx ON public.pending_news USING btree (pq_id);
CREATE INDEX pending_news_key_hash ON public.pending_news USING btree (group_hash);
CREATE INDEX pending_news_process ON public.pending_news USING btree (processed);
CREATE INDEX pending_news_status_created ON public.pending_news USING btree (status, processed, created_at);
CREATE INDEX search_content_gist ON public.pending_news USING gist (to_tsvector('russian'::regconfig, search_content));
CREATE INDEX title_gist ON public.pending_news USING gist (to_tsvector('russian'::regconfig, title));
CREATE INDEX title_idx ON public.pending_news USING gin (to_tsvector('russian'::regconfig, title));
SELECT pn2.id, pn2.title, similarity(pn1.search_content , pn2.search_content ) AS sml
FROM pending_news AS pn1
INNER JOIN pending_news AS pn2 ON
pn2.created_at > (pn1.created_at - interval '24 hours')
AND pn2.id <> 6126286
AND pn1.search_content % pn2.search_content
WHERE
pn1.id = 6126286
AND pn1.created_at >= pn2.created_at
order by pn1.search_content <-> pn2.search_content desc