sithuWiki committed on
Commit
a8a9175
·
verified ·
1 Parent(s): 32e930f

update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +135 -61
preprocessing.py CHANGED
@@ -17,82 +17,156 @@ def engineer_features(blockchain_df):
17
  return df
18
 
19
 
20
- def prepare_miner_features(blockchain_df, miner_name, miner_price, region='texas'):
21
- """Add miner-specific features - EXACTLY 14 features"""
22
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  df = blockchain_df.copy()
24
  specs = MINER_SPECS[miner_name]
25
-
26
  # Keep only these columns from blockchain data
27
- df = df[['date', 'bitcoin_price', 'difficulty', 'fees', 'hashrate', 'revenue', 'block_reward']].copy()
28
- df['date'] = pd.to_datetime(df['date'])
29
-
30
- # Add miner features
31
- df['machine_price'] = miner_price
32
- df['machine_hashrate'] = specs['hashrate']
33
- df['power'] = specs['power']
34
- df['efficiency'] = specs['efficiency']
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  # Calculate age_days (days since miner was released)
37
- release_date = pd.to_datetime(specs['release_date'])
38
- df['age_days'] = (df['date'] - release_date).dt.days
39
-
40
  # Days since halving
41
- df['days_since_halving'] = df['date'].apply(get_days_since_halving)
42
-
43
- # Revenue potential
44
- hashrate_hs = df['machine_hashrate'] * 1e12
45
- btc_per_day = (hashrate_hs * 86400) / (df['difficulty'] * (2**32)) * (df['block_reward'] + (df['fees']/144))
46
- df['Revenue_Potential'] = btc_per_day * df['bitcoin_price']
47
-
48
- # Electricity rate
49
- # df['electricity_rate'] = ELECTRICITY_RATES.get(region, 0.10)
50
- df['electricity_rate'] = df['date'].dt.date.apply(
51
- lambda day: get_electricity_rate(region, day)
52
  )
53
-
 
 
 
 
 
 
 
 
 
 
 
 
54
  return df
55
 
56
 
57
- def get_latest_sequence(blockchain_df, miner_name, miner_price, region='texas', window_size=30):
58
- """Get the most recent sequence for prediction - EXACTLY 14 features in CORRECT ORDER"""
59
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  df_features = engineer_features(blockchain_df)
61
- df_miner = prepare_miner_features(df_features, miner_name, miner_price, region)
62
-
 
 
 
 
 
 
 
 
 
63
  # CRITICAL: This order MUST match your training data CSV exactly!
64
- # Your training CSV: bitcoin_price,difficulty,fees,hashrate,revenue,machine_price,machine_hashrate,power,efficiency,block_reward,age_days,days_since_halving,Revenue_Potential,electricity_rate
65
  feature_cols = [
66
- 'bitcoin_price', # 1
67
- 'difficulty', # 2
68
- 'fees', # 3
69
- 'hashrate', # 4
70
- 'revenue', # 5
71
- 'machine_price', # 6
72
- 'machine_hashrate', # 7
73
- 'power', # 8
74
- 'efficiency', # 9
75
- 'block_reward', # 10
76
- 'age_days', # 11
77
- 'days_since_halving',# 12
78
- 'Revenue_Potential', # 13
79
- 'electricity_rate' # 14
80
  ]
81
-
82
  df_miner = df_miner.dropna().reset_index(drop=True)
83
-
84
  if len(df_miner) < window_size:
85
- raise ValueError(f"Not enough data: need {window_size} days, have {len(df_miner)}")
86
-
87
- # Get last window_size days with exactly 14 features
88
- sequence = df_miner[feature_cols].values[-window_size:]
89
- latest_date = df_miner['date'].iloc[-1]
90
-
91
- # Verify shape
92
- if sequence.shape[1] != 14:
93
- raise ValueError(f"Expected 14 features, got {sequence.shape[1]}")
94
-
95
- return sequence, feature_cols, latest_date
96
 
97
 
98
 
 
17
  return df
18
 
19
 
20
def prepare_miner_features(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
):
    """
    Build the per-day feature frame for one miner: ``date`` plus 14 features.

    The blockchain columns are carried over unchanged; the miner columns are
    constants broadcast over every row, and the remaining columns are derived
    from those two groups.

    Args:
        blockchain_df: DataFrame holding at least the columns ``date``,
            ``bitcoin_price``, ``difficulty``, ``fees``, ``hashrate``,
            ``revenue`` and ``block_reward``. It is not modified.
        miner_name: Key into ``MINER_SPECS``. The entry always supplies
            ``release_date`` and supplies ``hashrate`` / ``power`` /
            ``efficiency`` when the matching override below is ``None``.
        miner_price: Machine price; stored as ``float`` in ``machine_price``.
        region: Region passed to ``get_electricity_rate``. Only consulted when
            ``electricity_rate`` is ``None``.
        machine_hashrate: Optional override for ``specs["hashrate"]``.
        power: Optional override for ``specs["power"]``.
        efficiency: Optional override for ``specs["efficiency"]``.
        electricity_rate: Optional constant rate applied to every row. When
            ``None``, the rate is looked up per day from ``region``.

    Returns:
        A new DataFrame with ``date`` followed by the 14 feature columns.

    Raises:
        KeyError: If ``miner_name`` is not in ``MINER_SPECS`` or a required
            column is missing from ``blockchain_df``.
    """
    specs = MINER_SPECS[miner_name]

    # Keep only these columns from blockchain data. Selecting with a list
    # and copying once leaves the caller's frame untouched.
    df = blockchain_df[[
        "date",
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "block_reward",
    ]].copy()
    df["date"] = pd.to_datetime(df["date"])

    # ---- user-provided constants (same value for every row) ----
    df["machine_price"] = float(miner_price)

    # column name -> (explicit override, MINER_SPECS key used as fallback).
    # Insertion order fixes the column order in the resulting frame.
    spec_backed = {
        "machine_hashrate": (machine_hashrate, "hashrate"),
        "power": (power, "power"),
        "efficiency": (efficiency, "efficiency"),
    }
    for column, (override, spec_key) in spec_backed.items():
        # The spec entry is only read when no override was supplied.
        df[column] = float(override) if override is not None else specs[spec_key]

    # Calculate age_days (days since miner was released)
    release_date = pd.to_datetime(specs["release_date"])
    df["age_days"] = (df["date"] - release_date).dt.days

    # Days since halving
    df["days_since_halving"] = df["date"].apply(get_days_since_halving)

    # Revenue potential: BTC earned per day by this machine, priced in the
    # units of bitcoin_price.
    # NOTE(review): the 1e12 factor presumably converts TH/s to H/s, and
    # fees / 144 presumably spreads daily fees over ~144 blocks - confirm.
    hashrate_hs = df["machine_hashrate"] * 1e12
    btc_per_day = (
        (hashrate_hs * 86400)
        / (df["difficulty"] * (2**32))
        * (df["block_reward"] + (df["fees"] / 144))
    )
    df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"]

    # ---- electricity_rate ----
    if electricity_rate is not None:
        # Explicit rate: constant across all rows.
        df["electricity_rate"] = float(electricity_rate)
    else:
        # Fallback: per-day rate for the region. This must write the
        # "electricity_rate" column - writing anywhere else would both
        # clobber another feature and leave this column missing.
        df["electricity_rate"] = df["date"].dt.date.apply(
            lambda day: get_electricity_rate(region, day)
        )

    return df
104
 
105
 
106
+
107
def get_latest_sequence(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    window_size=30,
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
):
    """
    Return the most recent ``window_size`` rows as a model input sequence.

    The blockchain frame is passed through ``engineer_features`` and then
    ``prepare_miner_features``; rows containing any NaN are dropped, and the
    last ``window_size`` remaining rows form the window.

    Args:
        blockchain_df: Raw blockchain DataFrame handed to ``engineer_features``.
        miner_name, miner_price, region: Forwarded to
            ``prepare_miner_features``.
        window_size: Number of trailing rows in the returned window.
        machine_hashrate, power, efficiency, electricity_rate: Optional
            overrides forwarded to ``prepare_miner_features``.

    Returns:
        A tuple ``(sequence, df_window, pred_date)``:
        ``sequence`` is a float array of shape ``(window_size, 14)`` whose
        columns follow the order listed below; ``df_window`` is the
        re-indexed window DataFrame; ``pred_date`` is the ``date`` of the
        window's last row.

    Raises:
        ValueError: If fewer than ``window_size`` rows survive the NaN drop.
    """
    # CRITICAL: this order must match the training data CSV exactly.
    ordered_features = [
        "bitcoin_price",       # 1
        "difficulty",          # 2
        "fees",                # 3
        "hashrate",            # 4
        "revenue",             # 5
        "machine_price",       # 6
        "machine_hashrate",    # 7
        "power",               # 8
        "efficiency",          # 9
        "block_reward",        # 10
        "age_days",            # 11
        "days_since_halving",  # 12
        "Revenue_Potential",   # 13
        "electricity_rate",    # 14
    ]

    miner_overrides = dict(
        machine_hashrate=machine_hashrate,
        power=power,
        efficiency=efficiency,
        electricity_rate=electricity_rate,
    )
    engineered = engineer_features(blockchain_df)
    miner_frame = prepare_miner_features(
        engineered, miner_name, miner_price, region, **miner_overrides
    )

    # Only complete rows may enter the window.
    complete_rows = miner_frame.dropna().reset_index(drop=True)
    row_count = len(complete_rows)
    if row_count < window_size:
        raise ValueError(
            f"Not enough data to build a {window_size}-day window, got {row_count} rows."
        )

    df_window = complete_rows.tail(window_size).reset_index(drop=True)
    feature_matrix = df_window[ordered_features].values.astype(float)
    last_date = df_window["date"].iloc[-1]
    return feature_matrix, df_window, last_date
169
+
 
170
 
171
 
172