opendal_core/services/huggingface/
backend.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::sync::Arc;
19
20use bytes::Buf;
21use http::Response;
22use http::StatusCode;
23use log::debug;
24
25use super::HUGGINGFACE_SCHEME;
26use super::config::HuggingfaceConfig;
27use super::core::HuggingfaceCore;
28use super::core::HuggingfaceStatus;
29use super::error::parse_error;
30use super::lister::HuggingfaceLister;
31use crate::raw::*;
32use crate::*;
33
/// [Huggingface](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api)'s API support.
#[doc = include_str!("docs.md")]
#[derive(Debug, Default)]
pub struct HuggingfaceBuilder {
    /// Configuration accumulated by the builder's setter methods and consumed
    /// when the backend is built.
    pub(super) config: HuggingfaceConfig,
}
40
41impl HuggingfaceBuilder {
42    /// Set repo type of this backend. Default is model.
43    ///
44    /// Available values:
45    /// - model
46    /// - dataset
47    /// - datasets (alias for dataset)
48    /// - space
49    ///
50    /// [Reference](https://huggingface.co/docs/hub/repositories)
51    pub fn repo_type(mut self, repo_type: &str) -> Self {
52        if !repo_type.is_empty() {
53            self.config.repo_type = Some(repo_type.to_string());
54        }
55        self
56    }
57
58    /// Set repo id of this backend. This is required.
59    ///
60    /// Repo id consists of the account name and the repository name.
61    ///
62    /// For example, model's repo id looks like:
63    /// - meta-llama/Llama-2-7b
64    ///
65    /// Dataset's repo id looks like:
66    /// - databricks/databricks-dolly-15k
67    pub fn repo_id(mut self, repo_id: &str) -> Self {
68        if !repo_id.is_empty() {
69            self.config.repo_id = Some(repo_id.to_string());
70        }
71        self
72    }
73
74    /// Set revision of this backend. Default is main.
75    ///
76    /// Revision can be a branch name or a commit hash.
77    ///
78    /// For example, revision can be:
79    /// - main
80    /// - 1d0c4eb
81    pub fn revision(mut self, revision: &str) -> Self {
82        if !revision.is_empty() {
83            self.config.revision = Some(revision.to_string());
84        }
85        self
86    }
87
88    /// Set root of this backend.
89    ///
90    /// All operations will happen under this root.
91    pub fn root(mut self, root: &str) -> Self {
92        self.config.root = if root.is_empty() {
93            None
94        } else {
95            Some(root.to_string())
96        };
97
98        self
99    }
100
101    /// Set the token of this backend.
102    ///
103    /// This is optional.
104    pub fn token(mut self, token: &str) -> Self {
105        if !token.is_empty() {
106            self.config.token = Some(token.to_string());
107        }
108        self
109    }
110
111    /// configure the Hub base url. You might want to set this variable if your
112    /// organization is using a Private Hub https://huggingface.co/enterprise
113    ///
114    /// Default is "https://huggingface.co"
115    pub fn endpoint(mut self, endpoint: &str) -> Self {
116        if !endpoint.is_empty() {
117            self.config.endpoint = Some(endpoint.to_string());
118        }
119        self
120    }
121}
122
123impl Builder for HuggingfaceBuilder {
124    type Config = HuggingfaceConfig;
125
126    /// Build a HuggingfaceBackend.
127    fn build(self) -> Result<impl Access> {
128        debug!("backend build started: {:?}", &self);
129
130        let repo_type = match self.config.repo_type.as_deref() {
131            Some("model") => Ok(RepoType::Model),
132            Some("dataset") | Some("datasets") => Ok(RepoType::Dataset),
133            Some("space") => Ok(RepoType::Space),
134            Some(repo_type) => Err(Error::new(
135                ErrorKind::ConfigInvalid,
136                format!("unknown repo_type: {repo_type}").as_str(),
137            )
138            .with_operation("Builder::build")
139            .with_context("service", HUGGINGFACE_SCHEME)),
140            None => Ok(RepoType::Model),
141        }?;
142        debug!("backend use repo_type: {:?}", &repo_type);
143
144        let repo_id = match &self.config.repo_id {
145            Some(repo_id) => Ok(repo_id.clone()),
146            None => Err(Error::new(ErrorKind::ConfigInvalid, "repo_id is empty")
147                .with_operation("Builder::build")
148                .with_context("service", HUGGINGFACE_SCHEME)),
149        }?;
150        debug!("backend use repo_id: {}", &repo_id);
151
152        let revision = match &self.config.revision {
153            Some(revision) => revision.clone(),
154            None => "main".to_string(),
155        };
156        debug!("backend use revision: {}", &revision);
157
158        let root = normalize_root(&self.config.root.unwrap_or_default());
159        debug!("backend use root: {}", &root);
160
161        let token = self.config.token.as_ref().cloned();
162
163        let endpoint = match &self.config.endpoint {
164            Some(endpoint) => endpoint.clone(),
165            None => {
166                // Try to read from HF_ENDPOINT env var which is used
167                // by the official huggingface clients.
168                if let Ok(env_endpoint) = std::env::var("HF_ENDPOINT") {
169                    env_endpoint
170                } else {
171                    "https://huggingface.co".to_string()
172                }
173            }
174        };
175        debug!("backend use endpoint: {}", &endpoint);
176
177        Ok(HuggingfaceBackend {
178            core: Arc::new(HuggingfaceCore {
179                info: {
180                    let am = AccessorInfo::default();
181                    am.set_scheme(HUGGINGFACE_SCHEME)
182                        .set_native_capability(Capability {
183                            stat: true,
184                            read: true,
185                            list: true,
186                            list_with_recursive: true,
187                            shared: true,
188                            ..Default::default()
189                        });
190                    am.into()
191                },
192                repo_type,
193                repo_id,
194                revision,
195                root,
196                token,
197                endpoint,
198            }),
199        })
200    }
201}
202
/// Backend for Huggingface service
#[derive(Debug, Clone)]
pub struct HuggingfaceBackend {
    /// Core state (accessor info, repo settings, token and endpoint) shared
    /// behind an `Arc`; it is also handed to the lister when listing.
    core: Arc<HuggingfaceCore>,
}
208
209impl Access for HuggingfaceBackend {
210    type Reader = HttpBody;
211    type Writer = ();
212    type Lister = oio::PageLister<HuggingfaceLister>;
213    type Deleter = ();
214
215    fn info(&self) -> Arc<AccessorInfo> {
216        self.core.info.clone()
217    }
218
219    async fn stat(&self, path: &str, _: OpStat) -> Result<RpStat> {
220        // Stat root always returns a DIR.
221        if path == "/" {
222            return Ok(RpStat::new(Metadata::new(EntryMode::DIR)));
223        }
224
225        let resp = self.core.hf_path_info(path).await?;
226
227        let status = resp.status();
228
229        match status {
230            StatusCode::OK => {
231                let mut meta = parse_into_metadata(path, resp.headers())?;
232                let bs = resp.into_body();
233
234                let decoded_response: Vec<HuggingfaceStatus> =
235                    serde_json::from_reader(bs.reader()).map_err(new_json_deserialize_error)?;
236
237                // NOTE: if the file is not found, the server will return 200 with an empty array
238                if let Some(status) = decoded_response.first() {
239                    if let Some(commit_info) = status.last_commit.as_ref() {
240                        meta.set_last_modified(commit_info.date.parse::<Timestamp>()?);
241                    }
242
243                    meta.set_content_length(status.size);
244
245                    // Use LFS OID as ETag if available, otherwise use regular OID
246                    let etag = if let Some(lfs) = &status.lfs {
247                        &lfs.oid
248                    } else {
249                        &status.oid
250                    };
251                    meta.set_etag(etag);
252
253                    match status.type_.as_str() {
254                        "directory" => meta.set_mode(EntryMode::DIR),
255                        "file" => meta.set_mode(EntryMode::FILE),
256                        _ => return Err(Error::new(ErrorKind::Unexpected, "unknown status type")),
257                    };
258                } else {
259                    return Err(Error::new(ErrorKind::NotFound, "path not found"));
260                }
261
262                Ok(RpStat::new(meta))
263            }
264            _ => Err(parse_error(resp)),
265        }
266    }
267
268    async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> {
269        let resp = self.core.hf_resolve(path, args.range(), &args).await?;
270
271        let status = resp.status();
272
273        match status {
274            StatusCode::OK | StatusCode::PARTIAL_CONTENT => {
275                Ok((RpRead::default(), resp.into_body()))
276            }
277            _ => {
278                let (part, mut body) = resp.into_parts();
279                let buf = body.to_buffer().await?;
280                Err(parse_error(Response::from_parts(part, buf)))
281            }
282        }
283    }
284
285    async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> {
286        let l = HuggingfaceLister::new(self.core.clone(), path.to_string(), args.recursive());
287
288        Ok((RpList::default(), oio::PageLister::new(l)))
289    }
290}
291
/// Repository type of Huggingface. Supports `model`, `dataset`, and `space`.
///
/// The builder maps the configured `repo_type` string onto these variants and
/// falls back to `Model` when no repo type is configured.
///
/// [Reference](https://huggingface.co/docs/hub/repositories)
#[derive(Debug, Clone, Copy)]
pub enum RepoType {
    /// A model repository, selected by `"model"`; also the default.
    Model,
    /// A dataset repository, selected by `"dataset"` or its alias `"datasets"`.
    Dataset,
    /// A space repository, selected by `"space"`.
    Space,
}
300
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a backend for the given repo id and repo type, panicking with
    /// `failure` if the builder rejects the configuration.
    fn build_or_panic(repo_id: &str, repo_type: &str, failure: &str) {
        let built = HuggingfaceBuilder::default()
            .repo_id(repo_id)
            .repo_type(repo_type)
            .build();
        built.expect(failure);
    }

    /// The plural `datasets` spelling must be accepted as a repo type.
    #[test]
    fn build_accepts_datasets_alias() {
        build_or_panic(
            "org/repo",
            "datasets",
            "builder should accept datasets alias",
        );
    }

    /// The `space` repo type must be accepted.
    #[test]
    fn build_accepts_space_repo_type() {
        build_or_panic(
            "org/space",
            "space",
            "builder should accept space repo type",
        );
    }
}