feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)

This commit is contained in:
Zhaofeng Miao
2024-09-30 09:57:19 +08:00
committed by GitHub
parent fb49413a41
commit 369e1e6f58
38 changed files with 927 additions and 75 deletions

View File

@@ -1,8 +1,12 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import s from './index.module.css'
import NoData from './no-data'
import Firecrawl from './firecrawl'
import JinaReader from './jina-reader'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets'
@@ -12,6 +16,7 @@ type Props = {
onPreview: (payload: CrawlResultItem) => void
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onCrawlProviderChange: (provider: DataSourceProvider) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
onPreview,
checkedCrawlResult,
onCheckedCrawlResultChange,
onCrawlProviderChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { t } = useTranslation()
const { setShowAccountSettingModal } = useModalContext()
const [isLoaded, setIsLoaded] = useState(false)
const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
const [sources, setSources] = useState<DataSourceItem[]>([])
useEffect(() => {
onCrawlProviderChange(selectedProvider)
}, [selectedProvider, onCrawlProviderChange])
const checkSetApiKey = useCallback(async () => {
const res = await fetchDataSources() as any
const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
setIsSetFirecrawlApiKey(isFirecrawlSet)
setSources(res.sources)
// If users have configured one of the providers, select it.
const availableProviders = res.sources.filter((item: DataSourceItem) =>
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
)
if (availableProviders.length > 0)
setSelectedProvider(availableProviders[0].provider)
}, [])
useEffect(() => {
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
return (
<div>
{isSetFirecrawlApiKey
? (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} />
)}
<div className="mb-4">
<div className="font-medium text-gray-700 mb-2 h-6">
{t('datasetCreation.stepOne.website.chooseProvider')}
</div>
<div className="flex space-x-2">
<button
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
selectedProvider === DataSourceProvider.jinaReader
? 'bg-primary-50 text-primary-600'
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
}`}
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
>
<span className={cn(s.jinaLogo, 'mr-2')} />
<span>Jina Reader</span>
</button>
<button
className={`px-4 py-2 text-sm font-medium rounded-md ${
selectedProvider === DataSourceProvider.fireCrawl
? 'bg-primary-50 text-primary-600'
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
}`}
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
>
🔥 Firecrawl
</button>
</div>
</div>
{
selectedProvider === DataSourceProvider.fireCrawl
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
? (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
? (
<JinaReader
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
}
</div>
)
}