Feat/parent child retrieval (#12086)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: AkaraChen <akarachen@outlook.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Warren Chen <warren.chen830@gmail.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: Yi Xiao <54782454+YIXIAO0@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: KVOJJJin <jzongcode@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: JzoNgKVO <27049666+JzoNgKVO@users.noreply.github.com>
Co-authored-by: Charlie.Wei <luowei@cvte.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: huayaoyue6 <huayaoyue@163.com>
Co-authored-by: kurokobo <kuro664@gmail.com>
Co-authored-by: Matsuda <yiyth.fcb6@gmail.com>
Co-authored-by: shirochan <s.yusuke0711@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: Huỳnh Gia Bôi <boihuynh147@gmail.com>
Co-authored-by: Julian Huynh <julian.huynh@immersio.io>
Co-authored-by: Hash Brown <hi@xzd.me>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: Kazuki Takamatsu <kazuki.takamatsu@chowagiken.co.jp>
Co-authored-by: Trey Dong <1346650911@qq.com>
Co-authored-by: VoidIsVoid <343750470@qq.com>
Co-authored-by: Gimling <huangjl@ruyi.ai>
Co-authored-by: xiandan-erizo <xiandan.erizo@gmail.com>
Co-authored-by: Muneyuki Noguchi <nogu.dev@gmail.com>
Co-authored-by: zhaobingshuang <1475195565@qq.com>
Co-authored-by: zhaobs <zhaobs@cailian.net>
Co-authored-by: suzuki.sh <s2terminal@users.noreply.github.com>
Co-authored-by: Yingchun Lai <laiyingchun@apache.org>
Co-authored-by: huanshare <huanshare@live.com>
Co-authored-by: huanshare <liuhuan101@longfor.com>
Co-authored-by: orangeclk <orangeclk@users.noreply.github.com>
Co-authored-by: 문정현 <120004247+JungHyunMoon@users.noreply.github.com>
Co-authored-by: barabicu <kztk533@gmail.com>
Co-authored-by: Wei Mingzhi <whistler_wmz@users.sf.net>
Co-authored-by: Paul van Oorschot <20116814+pvoo@users.noreply.github.com>
Co-authored-by: zkyTech <zhangkunyuan@hotmail.com>
Co-authored-by: zhangkunyuan <zhangkunyuan@cmhi.chinamobile.com>
Co-authored-by: Tommy <34446820+Asterovim@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Novice <857526207@qq.com>
Co-authored-by: Novice Lee <novicelee@NovicedeMacBook-Pro.local>
Co-authored-by: Novice Lee <novicelee@NoviPro.local>
Co-authored-by: zxhlyh <16177003+zxhlyh@users.noreply.github.com>
Co-authored-by: liuzhenghua <1090179900@qq.com>
Co-authored-by: Jiang <65766008+AlwaysBluer@users.noreply.github.com>
Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
Co-authored-by: Joe <79627742+ZhouhaoJiang@users.noreply.github.com>
Co-authored-by: Alok Shrivastwa <alok.shrivastwa@gmail.com>
Co-authored-by: Alok Shrivastwa <Alok.Shrivastwa@microland.com>
Co-authored-by: JasonVV <jasonwangiii@outlook.com>
Co-authored-by: Hiroshi Fujita <fujita-h@users.noreply.github.com>
Co-authored-by: Kevin9703 <51311316+Kevin9703@users.noreply.github.com>
Co-authored-by: NFish <douxc512@gmail.com>
Co-authored-by: Junyan Qin <1010553892@qq.com>
Co-authored-by: IWAI, Masaharu <iwaim.sub@gmail.com>
Co-authored-by: IWAI, Masaharu <iwai_masaharu@funkit.co.jp>
Co-authored-by: Bowen Liang <liangbowen@gf.com.cn>
Co-authored-by: luckylhb90 <luckylhb90@gmail.com>
Co-authored-by: hobo.l <hobo.l@binance.com>
Co-authored-by: douxc <7553076+douxc@users.noreply.github.com>
This commit is contained in:
Wu Tianwei
2024-12-25 18:17:15 +08:00
committed by GitHub
parent 39ace9bdee
commit 2b2263a349
216 changed files with 9066 additions and 3116 deletions

View File

@@ -1,6 +1,7 @@
import type { DataSourceNotionPage, DataSourceProvider } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
export enum DataSourceType {
FILE = 'upload_file',
@@ -10,6 +11,12 @@ export enum DataSourceType {
export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
/**
 * Chunking modes. Each member maps a short name to the `*_model` string value.
 * Used in this file as the type of the `doc_form` fields.
 */
export enum ChunkingMode {
  /** General text */
  text = 'text_model',
  /** General QA */
  qa = 'qa_model',
  /** Parent-Child */
  parentChild = 'hierarchical_model',
}
export type DataSet = {
id: string
name: string
@@ -18,11 +25,12 @@ export type DataSet = {
description: string
permission: DatasetPermission
data_source_type: DataSourceType
indexing_technique: 'high_quality' | 'economy'
indexing_technique: IndexingType
created_by: string
updated_by: string
updated_at: number
app_count: number
doc_form: ChunkingMode
document_count: number
word_count: number
provider: string
@@ -95,6 +103,12 @@ export type CustomFile = File & {
created_at?: number
}
/**
 * A document entry carrying three required string fields:
 * `id`, `name` and `extension`.
 */
export type DocumentItem = {
id: string
name: string
// presumably the file extension of the document — confirm whether it includes the leading dot
extension: string
}
export type CrawlOptions = {
crawl_sub_pages: boolean
only_main_content: boolean
@@ -144,7 +158,7 @@ export type IndexingEstimateResponse = {
total_price: number
currency: string
total_segments: number
preview: string[]
preview: Array<{ content: string; child_chunks: string[] }>
qa_preview?: QA[]
}
@@ -170,7 +184,12 @@ export type IndexingStatusBatchResponse = {
data: IndexingStatusResponse[]
}
export type ProcessMode = 'automatic' | 'custom'
/**
 * Processing modes, used in this file as the type of the `mode` field on
 * process-rule types.
 *
 * Member names differ from their string values: `general` is `'custom'`
 * and `parentChild` is `'hierarchical'`.
 */
export enum ProcessMode {
// string value is 'custom', not 'general'
general = 'custom',
// string value is 'hierarchical', not 'parentChild'
parentChild = 'hierarchical',
}
/**
 * Allowed values for `Rules.parent_mode`: `'full-doc'` or `'paragraph'`.
 * NOTE(review): presumably selects whether the parent chunk is the whole
 * document or a single paragraph — confirm against the backend.
 */
export type ParentMode = 'full-doc' | 'paragraph'
export type ProcessRuleResponse = {
mode: ProcessMode
@@ -181,6 +200,8 @@ export type ProcessRuleResponse = {
/**
 * Rule set carried in the `rules` field of process-rule types.
 * All four fields are required.
 */
export type Rules = {
// list of pre-processing rules
pre_processing_rules: PreProcessingRule[]
// segmentation settings (separator, max_tokens, optional chunk_overlap)
segmentation: Segmentation
// 'full-doc' or 'paragraph'
parent_mode: ParentMode
// same shape as `segmentation`; presumably applies to child chunks — TODO confirm
subchunk_segmentation: Segmentation
}
export type Limits = {
@@ -195,7 +216,7 @@ export type PreProcessingRule = {
export type Segmentation = {
separator: string
max_tokens: number
chunk_overlap: number
chunk_overlap?: number
}
export const DocumentIndexingStatusList = [
@@ -258,13 +279,14 @@ export type InitialDocumentDetail = {
display_status: DocumentDisplayStatus
completed_segments?: number
total_segments?: number
doc_form: 'text_model' | 'qa_model'
doc_form: ChunkingMode
doc_language: string
}
export type SimpleDocumentDetail = InitialDocumentDetail & {
enabled: boolean
word_count: number
is_qa: boolean // TODO waiting for backend to add this field
error?: string | null
archived: boolean
updated_at: number
@@ -289,7 +311,7 @@ export type DocumentListResponse = {
export type DocumentReq = {
original_document_id?: string
indexing_technique?: string
doc_form: 'text_model' | 'qa_model'
doc_form: ChunkingMode
doc_language: string
process_rule: ProcessRule
}
@@ -331,7 +353,7 @@ export type NotionPage = {
}
export type ProcessRule = {
mode: string
mode: ProcessMode
rules: Rules
}
@@ -341,6 +363,11 @@ export type createDocumentResponse = {
documents: InitialDocumentDetail[]
}
/**
 * A processing mode paired with its rules. Used in this file as the type of
 * `FullDocumentDetail.dataset_process_rule`.
 *
 * NOTE(review): the name looks like a misspelling of `ProcessRule`, and the
 * shape (`mode: ProcessMode`, `rules: Rules`) appears to duplicate that type.
 * Consider consolidating once all references can be updated together.
 */
export type PrecessRule = {
mode: ProcessMode
rules: Rules
}
export type FullDocumentDetail = SimpleDocumentDetail & {
batch: string
created_api_request_id: string
@@ -363,6 +390,8 @@ export type FullDocumentDetail = SimpleDocumentDetail & {
doc_type?: DocType | null | 'others'
doc_metadata?: DocMetadata | null
segment_count: number
dataset_process_rule: PrecessRule
document_process_rule: ProcessRule
[key: string]: any
}
@@ -399,12 +428,12 @@ export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing']
export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
export type SegmentsQuery = {
last_id?: string
page?: string
limit: number
// status?: SegmentStatus
hit_count_gte?: number
keyword?: string
enabled?: boolean
enabled?: boolean | 'all'
}
export type SegmentDetailModel = {
@@ -429,6 +458,8 @@ export type SegmentDetailModel = {
error: string | null
stopped_at: number
answer?: string
child_chunks?: ChildChunkDetail[]
updated_at: number
}
export type SegmentsResponse = {
@@ -436,6 +467,8 @@ export type SegmentsResponse = {
has_more: boolean
limit: number
total: number
total_pages: number
page: number
}
export type HitTestingRecord = {
@@ -448,10 +481,18 @@ export type HitTestingRecord = {
created_at: number
}
/**
 * A child chunk entry in a hit-testing result; element type of
 * `HitTesting.child_chunks`.
 */
export type HitTestingChildChunk = {
id: string
// text content of the child chunk
content: string
// numeric position; presumably the chunk's order within its parent — confirm
position: number
// numeric score; presumably retrieval relevance — confirm range with backend
score: number
}
export type HitTesting = {
segment: Segment
content: Segment
score: number
tsne_position: TsnePosition
child_chunks?: HitTestingChildChunk[] | null
}
export type ExternalKnowledgeBaseHitTesting = {
@@ -530,11 +571,7 @@ export type SegmentUpdater = {
content: string
answer?: string
keywords?: string[]
}
export enum DocForm {
TEXT = 'text_model',
QA = 'qa_model',
regenerate_child_chunks?: boolean
}
export type ErrorDocsResponse = {
@@ -579,3 +616,49 @@ export const DEFAULT_WEIGHTED_SCORE = {
keyword: 0.3,
},
}
/** Allowed values for `ChildChunkDetail.type`: `'automatic'` or `'customized'`. */
export type ChildChunkType = 'automatic' | 'customized'
/**
 * Detail record for a single child chunk. Element type of
 * `SegmentDetailModel.child_chunks` and `ChildSegmentsResponse.data`.
 */
export type ChildChunkDetail = {
id: string
position: number
// presumably the id of the parent segment — confirm
segment_id: string
content: string
word_count: number
// numeric timestamps; unit (seconds vs. milliseconds) is not visible here — TODO confirm
created_at: number
updated_at: number
// 'automatic' or 'customized'
type: ChildChunkType
}
/**
 * Response shape for a list of child chunks: the `data` items plus the
 * numeric paging fields `total`, `total_pages`, `page` and `limit`.
 */
export type ChildSegmentsResponse = {
data: ChildChunkDetail[]
total: number
total_pages: number
// NOTE(review): whether `page` is 0- or 1-based is not visible here — confirm
page: number
limit: number
}
/** Parameters identifying a single document by dataset id and document id; both are required. */
export type UpdateDocumentParams = {
datasetId: string
documentId: string
}
// Used in api url
/**
 * Document actions. Because the string values are used in API URLs, they
 * must not be changed without a matching backend change.
 * Every value equals its member name except `unArchive`, which is `'un_archive'`.
 */
export enum DocumentActionType {
enable = 'enable',
disable = 'disable',
archive = 'archive',
// value uses snake_case, unlike the camelCase member name
unArchive = 'un_archive',
delete = 'delete',
}
/**
 * Parameters for updating one or more documents in a dataset.
 * Only `datasetId` is required.
 */
export type UpdateDocumentBatchParams = {
datasetId: string
// optional single document id
documentId?: string
// optional; accepts either an array of ids or a single string
documentIds?: string[] | string
}
/** Response of a batch import: the job's id and its status, both strings. */
export type BatchImportResponse = {
job_id: string
// NOTE(review): the set of possible status strings is not visible here — confirm with backend
job_status: string
}